1# protocol.py -- Shared parts of the git protocols
2# Copyright (C) 2008 John Carr <john.carr@unrouted.co.uk>
3# Copyright (C) 2008-2012 Jelmer Vernooij <jelmer@jelmer.uk>
4#
5# SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
6# Dulwich is dual-licensed under the Apache License, Version 2.0 and the GNU
7# General Public License as public by the Free Software Foundation; version 2.0
8# or (at your option) any later version. You can redistribute it and/or
9# modify it under the terms of either of these two licenses.
10#
11# Unless required by applicable law or agreed to in writing, software
12# distributed under the License is distributed on an "AS IS" BASIS,
13# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14# See the License for the specific language governing permissions and
15# limitations under the License.
16#
17# You should have received a copy of the licenses; if not, see
18# <http://www.gnu.org/licenses/> for a copy of the GNU General Public License
19# and <http://www.apache.org/licenses/LICENSE-2.0> for a copy of the Apache
20# License, Version 2.0.
21#
22
23"""Generic functions for talking the git smart server protocol."""
24
25import types
26from collections.abc import Iterable
27from io import BytesIO
28from os import SEEK_END
29from typing import Callable, Optional
30
31import dulwich
32
33from .errors import GitProtocolError, HangupException
34
# Port number used by the TCP git protocol (git://).
TCP_GIT_PORT = 9418

# Git protocol version 0 is the original Git protocol, which lacked a
# version number until Git protocol version 1 was introduced by Brandon
# Williams in 2017.
#
# Protocol version 1 is simply the original v0 protocol with the addition of
# a single packet line, which precedes the ref advertisement, indicating the
# protocol version being used. This was done in preparation for protocol v2.
#
# Git protocol version 2 was first introduced by Brandon Williams in 2018 and
# adds many features. See the gitprotocol-v2(5) manual page for details.
# As of 2024, Git only implements version 2 during 'git fetch' and still uses
# version 0 during 'git push'.
GIT_PROTOCOL_VERSIONS = [0, 1, 2]
DEFAULT_GIT_PROTOCOL_VERSION_FETCH = 2
DEFAULT_GIT_PROTOCOL_VERSION_SEND = 0

# Forty ASCII zeros: the all-zero hex object id.
ZERO_SHA = b"0" * 40

# Acknowledgement modes, as returned by ack_type().
SINGLE_ACK = 0
MULTI_ACK = 1
MULTI_ACK_DETAILED = 2

# Side-band channel numbers.
# pack data
SIDE_BAND_CHANNEL_DATA = 1
# progress messages
SIDE_BAND_CHANNEL_PROGRESS = 2
# fatal error message just before stream aborts
SIDE_BAND_CHANNEL_FATAL = 3

# Capability names as they appear on the wire.
CAPABILITY_ATOMIC = b"atomic"
CAPABILITY_DEEPEN_SINCE = b"deepen-since"
CAPABILITY_DEEPEN_NOT = b"deepen-not"
CAPABILITY_DEEPEN_RELATIVE = b"deepen-relative"
CAPABILITY_DELETE_REFS = b"delete-refs"
CAPABILITY_INCLUDE_TAG = b"include-tag"
CAPABILITY_MULTI_ACK = b"multi_ack"
CAPABILITY_MULTI_ACK_DETAILED = b"multi_ack_detailed"
CAPABILITY_NO_DONE = b"no-done"
CAPABILITY_NO_PROGRESS = b"no-progress"
CAPABILITY_OFS_DELTA = b"ofs-delta"
CAPABILITY_QUIET = b"quiet"
CAPABILITY_REPORT_STATUS = b"report-status"
CAPABILITY_SHALLOW = b"shallow"
CAPABILITY_SIDE_BAND = b"side-band"
CAPABILITY_SIDE_BAND_64K = b"side-band-64k"
CAPABILITY_THIN_PACK = b"thin-pack"
CAPABILITY_AGENT = b"agent"
CAPABILITY_SYMREF = b"symref"
CAPABILITY_ALLOW_TIP_SHA1_IN_WANT = b"allow-tip-sha1-in-want"
CAPABILITY_ALLOW_REACHABLE_SHA1_IN_WANT = b"allow-reachable-sha1-in-want"
CAPABILITY_FETCH = b"fetch"
CAPABILITY_FILTER = b"filter"

# Magic ref that is used to attach capabilities to when
# there are no refs. Should always be set to ZERO_SHA.
CAPABILITIES_REF = b"capabilities^{}"

# Capabilities shared by the upload and receive capability sets below.
COMMON_CAPABILITIES = [
    CAPABILITY_OFS_DELTA,
    CAPABILITY_SIDE_BAND,
    CAPABILITY_SIDE_BAND_64K,
    CAPABILITY_AGENT,
    CAPABILITY_NO_PROGRESS,
]
KNOWN_UPLOAD_CAPABILITIES = {
    *COMMON_CAPABILITIES,
    CAPABILITY_THIN_PACK,
    CAPABILITY_MULTI_ACK,
    CAPABILITY_MULTI_ACK_DETAILED,
    CAPABILITY_INCLUDE_TAG,
    CAPABILITY_DEEPEN_SINCE,
    CAPABILITY_SYMREF,
    CAPABILITY_SHALLOW,
    CAPABILITY_DEEPEN_NOT,
    CAPABILITY_DEEPEN_RELATIVE,
    CAPABILITY_ALLOW_TIP_SHA1_IN_WANT,
    CAPABILITY_ALLOW_REACHABLE_SHA1_IN_WANT,
    CAPABILITY_FETCH,
}
KNOWN_RECEIVE_CAPABILITIES = {
    *COMMON_CAPABILITIES,
    CAPABILITY_REPORT_STATUS,
    CAPABILITY_DELETE_REFS,
    CAPABILITY_QUIET,
    CAPABILITY_ATOMIC,
}

# Largest signed 32-bit integer.
DEPTH_INFINITE = 0x7FFFFFFF

NAK_LINE = b"NAK\n"
131
132
def agent_string() -> bytes:
    """Return the agent identifier ``dulwich/<version>`` as ASCII bytes.

    The version part is the components of ``dulwich.__version__`` joined
    with dots.
    """
    version = ".".join(str(part) for part in dulwich.__version__)
    return f"dulwich/{version}".encode("ascii")
135
136
def capability_agent() -> bytes:
    """Return the agent capability in ``agent=<agent string>`` form."""
    return b"=".join((CAPABILITY_AGENT, agent_string()))
139
140
def capability_symref(from_ref: bytes, to_ref: bytes) -> bytes:
    """Return a symref capability in ``symref=<from_ref>:<to_ref>`` form.

    Args:
      from_ref: Name placed before the colon.
      to_ref: Name placed after the colon.
    """
    return b"".join((CAPABILITY_SYMREF, b"=", from_ref, b":", to_ref))
143
144
def extract_capability_names(capabilities: Iterable[bytes]) -> set[bytes]:
    """Return the set of capability names, with any ``=value`` part dropped.

    Args:
      capabilities: Capability strings, each either ``name`` or
        ``name=value``.
    """
    names = set()
    for capability in capabilities:
        name, _value = parse_capability(capability)
        names.add(name)
    return names
147
148
def parse_capability(capability: bytes) -> tuple[bytes, Optional[bytes]]:
    """Split a capability string into its name and optional value.

    Args:
      capability: Either ``name`` or ``name=value``; only the first ``=``
        separates the name from the value.
    Returns: Tuple of (name, value), where value is None when the capability
      contains no ``=``.
    """
    name, separator, value = capability.partition(b"=")
    if not separator:
        return (name, None)
    return (name, value)
154
155
def symref_capabilities(symrefs: Iterable[tuple[bytes, bytes]]) -> list[bytes]:
    """Build one symref capability string per (from_ref, to_ref) pair.

    Args:
      symrefs: Pairs that are passed positionally to capability_symref.
    Returns: List of capability strings, in input order.
    """
    capabilities = []
    for pair in symrefs:
        capabilities.append(capability_symref(*pair))
    return capabilities
158
159
# Command keywords, as bytes. Within this module COMMAND_SHALLOW and
# COMMAND_UNSHALLOW are the prefixes emitted by format_shallow_line() and
# format_unshallow_line(); the remaining keywords are not referenced here
# (presumably used by other modules -- verify against callers).
COMMAND_DEEPEN = b"deepen"
COMMAND_SHALLOW = b"shallow"
COMMAND_UNSHALLOW = b"unshallow"
COMMAND_DONE = b"done"
COMMAND_WANT = b"want"
COMMAND_HAVE = b"have"
166
167
def format_cmd_pkt(cmd: bytes, *args: bytes) -> bytes:
    """Format a command and its arguments as a single packet payload.

    The payload is the command, a space, then every argument followed by a
    NUL byte.

    Args:
      cmd: The command name.
      args: Arguments to append, each NUL-terminated in the output.
    """
    nul_terminated = [arg + b"\0" for arg in args]
    return cmd + b" " + b"".join(nul_terminated)
170
171
def parse_cmd_pkt(line: bytes) -> tuple[bytes, list[bytes]]:
    """Split a command packet payload into the command and its arguments.

    The expected layout is the command, a space, then NUL-terminated
    arguments (the inverse of format_cmd_pkt).

    Args:
      line: The packet payload to parse.
    Returns: Tuple of (command, [list of arguments]).
    Raises:
      AssertionError: If the argument section does not end with a NUL byte.
    """
    space_at = line.find(b" ")
    cmd = line[:space_at]
    payload = line[space_at + 1 :]
    # The final argument must be NUL-terminated.
    assert payload[-1:] == b"\x00"
    # Drop the trailing NUL before splitting so no empty last item appears.
    return cmd, payload[:-1].split(b"\0")
177
178
def pkt_line(data: Optional[bytes]) -> bytes:
    """Wrap data in a pkt-line.

    Args:
      data: The payload to wrap, as bytes, or None.
    Returns: The payload prefixed with its length (payload plus the 4-byte
      prefix itself) as four lowercase hex digits; if data was None, returns
      the flush-pkt ('0000').
    """
    if data is None:
        return b"0000"
    return b"%04x" % (len(data) + 4) + data
190
191
def pkt_seq(*seq: Optional[bytes]) -> bytes:
    """Wrap a sequence of data in pkt-lines, terminated by a flush-pkt.

    Args:
      seq: The payloads to wrap; each is passed to pkt_line in order.
    Returns: The concatenated pkt-lines followed by a final flush-pkt.
    """
    wrapped = [pkt_line(item) for item in (*seq, None)]
    return b"".join(wrapped)
199
200
def filter_ref_prefix(
    refs: dict[bytes, bytes], prefixes: Iterable[bytes]
) -> dict[bytes, bytes]:
    """Filter refs to only include those with a given prefix.

    Args:
      refs: A dict mapping ref names to their values.
      prefixes: The prefixes to filter by; a ref is kept when its name starts
        with at least one of them. Any iterable is accepted, including
        one-shot iterators and generators.
    Returns: A new dict with the matching entries, in the order of ``refs``.
      An empty ``prefixes`` matches nothing.
    """
    # Materialize once: the prefixes are checked against every ref, so a
    # one-shot iterator would otherwise be exhausted after the first ref and
    # all later refs would be dropped silently.
    wanted = tuple(prefixes)
    return {ref: sha for ref, sha in refs.items() if ref.startswith(wanted)}
211
212
class Protocol:
    """Class for interacting with a remote git process over the wire.

    Parts of the git wire protocol use 'pkt-lines' to communicate. A pkt-line
    consists of the length of the line as a 4-byte hex string, followed by the
    payload data. The length includes the 4-byte header. The special line
    '0000' indicates the end of a section of input and is called a 'flush-pkt'.

    For details on the pkt-line format, see the cgit distribution:
        Documentation/technical/protocol-common.txt
    """

    def __init__(
        self,
        read: Callable[[int], bytes],
        write: Callable[[bytes], Optional[int]],
        close: Optional[Callable[[], None]] = None,
        report_activity: Optional[Callable[[int, str], None]] = None,
    ) -> None:
        """Create a protocol wrapper around read and write callables.

        Args:
          read: Callable taking a byte count and returning the bytes read.
          write: Callable taking the bytes to send.
          close: Optional callable invoked by close().
          report_activity: Optional callable invoked with a byte count and
            either "read" or "write" whenever a pkt-line is transferred.
        """
        self.read = read
        self.write = write
        self._close = close
        self.report_activity = report_activity
        # Holds at most one pkt-line pushed back via unread_pkt_line().
        self._readahead: Optional[BytesIO] = None

    def close(self) -> None:
        """Invoke the close callback, if one was supplied."""
        if self._close:
            self._close()

    def __enter__(self) -> "Protocol":
        return self

    def __exit__(
        self,
        exc_type: Optional[type[BaseException]],
        exc_val: Optional[BaseException],
        exc_tb: Optional[types.TracebackType],
    ) -> None:
        self.close()

    def read_pkt_line(self) -> Optional[bytes]:
        """Reads a pkt-line from the remote git process.

        This method may read from the readahead buffer; see unread_pkt_line.

        Returns: The next string from the stream, without the length prefix, or
            None for a flush-pkt ('0000') or delim-pkt ('0001').

        Raises:
          HangupException: If the stream is at EOF or the connection was
            reset.
          GitProtocolError: If the length prefix is smaller than the 4-byte
            header, if the payload is shorter than announced, or if reading
            fails with an OSError.
        """
        if self._readahead is None:
            read = self.read
        else:
            # Consume the single pushed-back line, then go back to the wire.
            read = self._readahead.read
            self._readahead = None

        try:
            sizestr = read(4)
            if not sizestr:
                raise HangupException
            size = int(sizestr, 16)
            if size == 0 or size == 1:  # flush-pkt or delim-pkt
                if self.report_activity:
                    self.report_activity(4, "read")
                return None
            if size < 4:
                # The length includes the 4-byte header, so anything smaller
                # is malformed. Reject it before calling read() with a
                # negative size, which would drain (or block on) the stream.
                raise GitProtocolError(
                    f"Invalid pkt-line length prefix {sizestr!r}"
                )
            if self.report_activity:
                self.report_activity(size, "read")
            pkt_contents = read(size - 4)
        except ConnectionResetError as exc:
            raise HangupException from exc
        except OSError as exc:
            raise GitProtocolError(str(exc)) from exc
        else:
            if len(pkt_contents) + 4 != size:
                raise GitProtocolError(
                    f"Length of pkt read {len(pkt_contents) + 4:04x} does not match length prefix {size:04x}"
                )
            return pkt_contents

    def eof(self) -> bool:
        """Test whether the protocol stream has reached EOF.

        Note that this refers to the actual stream EOF and not just a
        flush-pkt.

        Returns: True if the stream is at EOF, False otherwise.
        """
        try:
            next_line = self.read_pkt_line()
        except HangupException:
            return True
        # Not at EOF: push the line back so the next read sees it again.
        self.unread_pkt_line(next_line)
        return False

    def unread_pkt_line(self, data: Optional[bytes]) -> None:
        """Unread a single line of data into the readahead buffer.

        This method can be used to unread a single pkt-line into a fixed
        readahead buffer.

        Args:
          data: The data to unread, without the length prefix.

        Raises:
          ValueError: If more than one pkt-line is unread.
        """
        if self._readahead is not None:
            raise ValueError("Attempted to unread multiple pkt-lines.")
        self._readahead = BytesIO(pkt_line(data))

    def read_pkt_seq(self) -> Iterable[bytes]:
        """Read a sequence of pkt-lines from the remote git process.

        Returns: Yields each line of data up to but not including the next
            flush-pkt. Iteration also stops at an empty pkt-line.
        """
        pkt = self.read_pkt_line()
        while pkt:
            yield pkt
            pkt = self.read_pkt_line()

    def write_pkt_line(self, line: Optional[bytes]) -> None:
        """Sends a pkt-line to the remote git process.

        Args:
          line: A string containing the data to send, without the length
            prefix.

        Raises:
          GitProtocolError: If writing fails with an OSError.
        """
        try:
            line = pkt_line(line)
            self.write(line)
            if self.report_activity:
                self.report_activity(len(line), "write")
        except OSError as exc:
            raise GitProtocolError(str(exc)) from exc

    def write_sideband(self, channel: int, blob: bytes) -> None:
        """Write multiplexed data to the sideband.

        Args:
          channel: An int specifying the channel to write to.
          blob: A blob of data (as a string) to send on this channel.
        """
        # a pktline can be a max of 65520. a sideband line can therefore be
        # 65520-5 = 65515
        # WTF: Why have the len in ASCII, but the channel in binary.
        prefix = bytes([channel])
        while blob:
            self.write_pkt_line(prefix + blob[:65515])
            blob = blob[65515:]

    def send_cmd(self, cmd: bytes, *args: bytes) -> None:
        """Send a command and some arguments to a git server.

        Only used for the TCP git protocol (git://).

        Args:
          cmd: The remote service to access.
          args: List of arguments to send to remote service.
        """
        self.write_pkt_line(format_cmd_pkt(cmd, *args))

    def read_cmd(self) -> tuple[bytes, list[bytes]]:
        """Read a command and some arguments from the git client.

        Only used for the TCP git protocol (git://).

        Returns: A tuple of (command, [list of arguments]).

        Raises:
          GitProtocolError: If a flush-pkt is received instead of a command.
        """
        line = self.read_pkt_line()
        if line is None:
            raise GitProtocolError("Expected command, got flush packet")
        return parse_cmd_pkt(line)
383
384
_RBUFSIZE = 65536  # 64KB buffer for better network I/O performance


class ReceivableProtocol(Protocol):
    """Variant of Protocol that allows reading up to a size without blocking.

    This class has a recv() method that behaves like socket.recv() in addition
    to a read() method.

    If you want to read n bytes from the wire and block until exactly n bytes
    (or EOF) are read, use read(n). If you want to read at most n bytes from
    the wire but don't care if you get less, use recv(n). Note that recv(n)
    will still block until at least one byte is read.
    """

    def __init__(
        self,
        recv: Callable[[int], bytes],
        write: Callable[[bytes], Optional[int]],
        close: Optional[Callable[[], None]] = None,
        report_activity: Optional[Callable[[int, str], None]] = None,
        rbufsize: int = _RBUFSIZE,
    ) -> None:
        """Create a protocol wrapper around a recv-style callable.

        Args:
          recv: Callable taking a maximum byte count and returning the bytes
            received; an empty result is treated as end of stream by read().
          write: Callable taking the bytes to send.
          close: Optional callable invoked by close().
          report_activity: Optional activity callback, passed to Protocol.
          rbufsize: Number of bytes requested from ``recv`` when recv()
            refills its buffer.
        """
        # The buffered read() below is the reader handed to the base class.
        super().__init__(self.read, write, close=close, report_activity=report_activity)
        self._recv = recv
        self._rbuf = BytesIO()
        self._rbufsize = rbufsize

    def read(self, size: int) -> bytes:
        """Read exactly ``size`` bytes, or fewer if the stream ends first.

        Args:
          size: Number of bytes wanted; must be positive.
        Returns: The bytes read, taken from the internal buffer first and then
          from the underlying recv callable.
        """
        # From _fileobj.read in socket.py in the Python 2.6.5 standard library,
        # with the following modifications:
        #  - omit the size <= 0 branch
        #  - seek back to start rather than 0 in case some buffer has been
        #    consumed.
        #  - use SEEK_END instead of the magic number.
        # Copyright (c) 2001-2010 Python Software Foundation; All Rights
        # Reserved
        # Licensed under the Python Software Foundation License.
        # TODO: see if buffer is more efficient than cBytesIO.
        assert size > 0

        # Our use of BytesIO rather than lists of string objects returned by
        # recv() minimizes memory usage and fragmentation that occurs when
        # rbufsize is large compared to the typical return value of recv().
        pending = self._rbuf
        origin = pending.tell()
        pending.seek(0, SEEK_END)
        # buffer may have been partially consumed by recv()
        available = pending.tell() - origin
        if available >= size:
            # Already have size bytes in our buffer? Extract and return,
            # keeping whatever is left over for the next call.
            pending.seek(origin)
            result = pending.read(size)
            self._rbuf = BytesIO(pending.read())
            return result

        self._rbuf = BytesIO()  # reset _rbuf.  we consume it via pending.
        while True:
            missing = size - available
            # recv() will malloc the amount of memory given as its
            # parameter even though it often returns much less data
            # than that.  The returned data string is short lived
            # as we copy it into a BytesIO and free it.  This avoids
            # fragmentation issues on many platforms.
            chunk = self._recv(missing)
            if not chunk:
                break
            received = len(chunk)
            if received == size and not available:
                # Shortcut.  Avoid buffer data copies when:
                # - We have no data in our buffer.
                # AND
                # - Our call to recv returned exactly the
                #   number of bytes we were asked to read.
                return chunk
            if received == missing:
                pending.write(chunk)
                break
            assert received <= missing, f"_recv({missing}) returned {received} bytes"
            pending.write(chunk)
            available += received
        pending.seek(origin)
        return pending.read()

    def recv(self, size: int) -> bytes:
        """Return at most ``size`` bytes, using buffered data when available.

        Args:
          size: Maximum number of bytes wanted; must be positive.
        Returns: Buffered bytes if any remain; otherwise bytes obtained from a
          single call to the underlying recv callable.
        """
        assert size > 0

        pending = self._rbuf
        position = pending.tell()
        pending.seek(0, SEEK_END)
        end = pending.tell()
        pending.seek(position)

        if end == position:
            # only read from the wire if our read buffer is exhausted
            chunk = self._recv(self._rbufsize)
            if len(chunk) == size:
                # shortcut: skip the buffer if we read exactly size bytes
                return chunk
            pending = BytesIO(chunk)
            self._rbuf = pending
        return pending.read(size)
496
497
def extract_capabilities(text: bytes) -> tuple[bytes, list[bytes]]:
    """Extract a capabilities list from a string, if present.

    Capabilities are everything after a NUL byte, separated by spaces.

    Args:
      text: String to extract from
    Returns: Tuple with text with capabilities removed and list of
      capabilities. Without a NUL byte the text is returned unchanged with an
      empty list.
    """
    if b"\0" in text:
        # Trailing whitespace (e.g. the newline) is dropped before splitting.
        remainder, raw_capabilities = text.rstrip().split(b"\0")
        return (remainder, raw_capabilities.strip().split(b" "))
    return text, []
509
510
def extract_want_line_capabilities(text: bytes) -> tuple[bytes, list[bytes]]:
    """Extract a capabilities list from a want line, if present.

    Note that want lines have capabilities separated from the rest of the line
    by a space instead of a null byte. Thus want lines have the form:

        want obj-id cap1 cap2 ...

    Args:
      text: Want line to extract from
    Returns: Tuple with text with capabilities removed and list of
      capabilities. A line with fewer than three space-separated fields is
      returned unchanged with an empty list.
    """
    fields = text.rstrip().split(b" ")
    if len(fields) >= 3:
        # The first two fields are the command and object id.
        return (b" ".join(fields[:2]), fields[2:])
    return text, []
527
528
def ack_type(capabilities: Iterable[bytes]) -> int:
    """Extract the ack type from a capabilities list.

    Returns: MULTI_ACK_DETAILED if ``multi_ack_detailed`` is present,
      otherwise MULTI_ACK if ``multi_ack`` is present, otherwise SINGLE_ACK.
    """
    # Checked in order of preference; the first capability found wins.
    preferences = (
        (b"multi_ack_detailed", MULTI_ACK_DETAILED),
        (b"multi_ack", MULTI_ACK),
    )
    for capability, kind in preferences:
        if capability in capabilities:
            return kind
    return SINGLE_ACK
536
537
class BufferedPktLineWriter:
    """Writer that wraps its data in pkt-lines and has an independent buffer.

    Consecutive calls to write() wrap the data in a pkt-line and then buffers
    it until enough lines have been written such that their total length
    (including length prefix) reach the buffer size.
    """

    def __init__(
        self, write: Callable[[bytes], Optional[int]], bufsize: int = 65515
    ) -> None:
        """Initialize the BufferedPktLineWriter.

        Args:
          write: A write callback for the underlying writer.
          bufsize: The internal buffer size, including length prefixes.
        """
        self._write = write
        self._bufsize = bufsize
        self._wbuf = BytesIO()
        # Number of bytes currently held in _wbuf.
        self._buflen = 0

    def write(self, data: bytes) -> None:
        """Write data, wrapping it in a pkt-line.

        The pkt-line is appended to the buffer. When it fills the buffer, the
        part that still fits is added, the buffer is flushed to the underlying
        writer, and the rest of the line starts the next buffer.

        Args:
          data: The payload to wrap and buffer.
        """
        line = pkt_line(data)
        line_len = len(line)
        # Bytes by which this line would reach or exceed the buffer size.
        over = self._buflen + line_len - self._bufsize
        if over >= 0:
            start = line_len - over
            self._wbuf.write(line[:start])
            self.flush()
        else:
            start = 0
        saved = line[start:]
        self._wbuf.write(saved)
        self._buflen += len(saved)

    def flush(self) -> None:
        """Flush all data from the buffer.

        Passes the buffered bytes (if any) to the underlying writer in a
        single call and leaves the buffer empty.
        """
        data = self._wbuf.getvalue()
        if data:
            self._write(data)
        # Reset the same counter write() consults. Otherwise the length keeps
        # growing and every later write() flushes immediately.
        self._buflen = 0
        self._wbuf = BytesIO()
582
583
class PktLineParser:
    """Packet line parser that hands completed packets off to a callback."""

    def __init__(self, handle_pkt: Callable[[Optional[bytes]], None]) -> None:
        """Initialize the parser.

        Args:
          handle_pkt: Callback invoked once per completed packet with the
            payload (length prefix removed), or with None for a flush-pkt
            ('0000') or delim-pkt ('0001').
        """
        self.handle_pkt = handle_pkt
        # Bytes received so far that do not yet form a complete packet.
        self._readahead = BytesIO()

    def parse(self, data: bytes) -> None:
        """Parse a fragment of data and call back for any completed packets.

        Data may arrive split at arbitrary positions; an incomplete trailing
        packet is kept until a later call supplies the rest.

        Args:
          data: The next fragment of the pkt-line stream.
        """
        self._readahead.write(data)
        buf = self._readahead.getvalue()
        if len(buf) < 4:
            return
        while len(buf) >= 4:
            size = int(buf[:4], 16)
            if size == 0 or size == 1:
                # flush-pkt or delim-pkt: a bare 4-byte header with no
                # payload. A delim-pkt must not fall through to the data
                # branch, where it would be consumed as a 1-byte packet and
                # leave the rest of the stream misaligned.
                self.handle_pkt(None)
                buf = buf[4:]
            elif size <= len(buf):
                # NOTE(review): length prefixes 0002 and 0003 are not
                # special-cased here.
                self.handle_pkt(buf[4:size])
                buf = buf[size:]
            else:
                # Packet is not complete yet; wait for more data.
                break
        self._readahead = BytesIO()
        self._readahead.write(buf)

    def get_tail(self) -> bytes:
        """Read back any unused data."""
        return self._readahead.getvalue()
613
614
def format_capability_line(capabilities: Iterable[bytes]) -> bytes:
    """Join capabilities into one string, each preceded by a space.

    Args:
      capabilities: The capability strings to format.
    Returns: The concatenation of ``b" " + capability`` for every capability;
      empty bytes when there are none.
    """
    return b"".join(b" " + capability for capability in capabilities)
617
618
def format_ref_line(
    ref: bytes, sha: bytes, capabilities: Optional[list[bytes]] = None
) -> bytes:
    """Format a ref line as ``<sha> <ref>`` with optional capabilities.

    Args:
      ref: The ref name.
      sha: The value written before the ref name.
      capabilities: When not None, a NUL byte followed by the formatted
        capability line is appended after the ref name.
    Returns: The formatted line, terminated by a newline.
    """
    line = sha + b" " + ref
    if capabilities is not None:
        line = line + b"\0" + format_capability_line(capabilities)
    return line + b"\n"
626
627
def format_shallow_line(sha: bytes) -> bytes:
    """Format a shallow line: the shallow command, a space, then ``sha``."""
    return b" ".join((COMMAND_SHALLOW, sha))
630
631
def format_unshallow_line(sha: bytes) -> bytes:
    """Format an unshallow line: the unshallow command, a space, then ``sha``."""
    return b" ".join((COMMAND_UNSHALLOW, sha))
634
635
def format_ack_line(sha: bytes, ack_type: bytes = b"") -> bytes:
    """Format an ACK line as ``ACK <sha>[ <ack_type>]`` plus a newline.

    Args:
      sha: The value written after ``ACK``.
      ack_type: Optional suffix; when non-empty it is appended after a space.
    """
    # An empty ack_type is appended unchanged, adding nothing to the line.
    suffix = b" " + ack_type if ack_type else ack_type
    return b"ACK " + sha + suffix + b"\n"