1from struct import unpack
2
3
class BinaryDecoder:
    """Decoder for the avro binary format.

    NOTE: All attributes and methods on this class should be considered
    private.

    Parameters
    ----------
    fo: file-like
        Input stream. Only its ``read(size)`` method is used, and it is
        expected to return ``bytes``.

    """

    def __init__(self, fo):
        self.fo = fo

    def read_null(self):
        """null is written as zero bytes.

        Nothing is consumed from the stream; always returns ``None``.
        """
        return None

    def read_boolean(self):
        """A boolean is written as a single byte whose value is either 0
        (false) or 1 (true).

        Raises ``struct.error`` if the stream cannot supply the byte.
        """

        # technically 0x01 == true and 0x00 == false, but many languages will
        # cast anything other than 0 to True and only 0 to False
        return unpack("B", self.fo.read(1))[0] != 0

    def read_long(self):
        """int and long values are written using variable-length, zig-zag
        coding.

        Each byte carries 7 payload bits (least-significant group first);
        the high bit is set on every byte except the last one.

        Raises ``EOFError`` if the stream is exhausted, either before the
        first byte or in the middle of a multi-byte value.
        """
        c = self.fo.read(1)

        # Most readers start here, so an empty read on the very first byte is
        # the ordinary "no more data" signal.
        if not c:
            raise EOFError

        b = ord(c)
        n = b & 0x7F
        shift = 7

        while (b & 0x80) != 0:
            c = self.fo.read(1)
            if not c:
                # The continuation bit promised another byte that never
                # arrived. Without this check ord(b"") would raise a
                # confusing TypeError instead of reporting truncation.
                raise EOFError(
                    "Unexpected end of stream inside a variable-length integer"
                )
            b = ord(c)
            n |= (b & 0x7F) << shift
            shift += 7

        # Undo the zig-zag mapping: even -> non-negative, odd -> negative.
        return (n >> 1) ^ -(n & 1)

    read_int = read_long

    def read_float(self):
        """A float is written as 4 bytes.

        The float is converted into a 32-bit integer using a method equivalent
        to Java's floatToIntBits and then encoded in little-endian format.

        Raises ``struct.error`` if fewer than 4 bytes are available.
        """
        return unpack("<f", self.fo.read(4))[0]

    def read_double(self):
        """A double is written as 8 bytes.

        The double is converted into a 64-bit integer using a method equivalent
        to Java's doubleToLongBits and then encoded in little-endian format.

        Raises ``struct.error`` if fewer than 8 bytes are available.
        """
        return unpack("<d", self.fo.read(8))[0]

    def read_bytes(self):
        """Bytes are encoded as a long followed by that many bytes of data.

        Raises ``EOFError`` if the length prefix is negative or if the stream
        holds fewer bytes than the prefix announces.
        """
        size = self.read_long()
        if size < 0:
            # A negative length can only come from corrupt input. Passing it
            # to read() would mean "read until EOF" for file-like objects and
            # pull the whole remaining stream into memory before failing.
            # EOFError is kept so that callers see the same exception type as
            # before for this input.
            raise EOFError(f"Invalid negative byte length {size}")
        out = self.fo.read(size)
        if len(out) != size:
            raise EOFError(f"Expected {size} bytes, read {len(out)}")
        return out

    def read_utf8(self, handle_unicode_errors="strict"):
        """A string is encoded as a long followed by that many bytes of UTF-8
        encoded character data.

        ``handle_unicode_errors`` is forwarded as the ``errors`` argument of
        ``bytes.decode``.
        """
        return self.read_bytes().decode("utf-8", errors=handle_unicode_errors)

    def read_fixed(self, size):
        """Fixed instances are encoded using the number of bytes declared in the
        schema.

        Raises ``EOFError`` if fewer than ``size`` bytes could be read.
        """
        out = self.fo.read(size)
        if len(out) < size:
            raise EOFError(f"Expected {size} bytes, read {len(out)}")
        return out

    def read_enum(self):
        """An enum is encoded by a int, representing the zero-based position of the
        symbol in the schema.
        """
        return self.read_long()

    def read_array_start(self):
        """Arrays are encoded as a series of blocks.

        Reads the first block count and stores it for ``iter_array``.
        """
        self._block_count = self.read_long()

    def read_array_end(self):
        """No-op; nothing is read at the end of an array."""
        pass

    def _iter_array_or_map(self):
        """Each block consists of a long count value, followed by that many
        array items. A block with count zero indicates the end of the array.
        Each item is encoded per the array's item schema.

        If a block's count is negative, then the count is followed immediately
        by a long block size, indicating the number of bytes in the block.
        The actual count in this case is the absolute value of the count
        written.

        This generator yields ``None`` once per item; the caller is expected
        to decode the item itself between iterations.
        """
        while self._block_count != 0:
            if self._block_count < 0:
                self._block_count = -self._block_count
                # Read block size, unused
                self.read_long()

            # range() captures the count when the loop starts, so this loop
            # is unaffected if self._block_count is reassigned while the
            # generator is suspended (e.g. by a nested array/map start).
            for _ in range(self._block_count):
                yield
            # The next block's count follows the last item of this block.
            self._block_count = self.read_long()

    iter_array = _iter_array_or_map
    iter_map = _iter_array_or_map

    def read_map_start(self):
        """Maps are encoded as a series of blocks.

        Reads the first block count and stores it for ``iter_map``.
        """
        self._block_count = self.read_long()

    def read_map_end(self):
        """No-op; nothing is read at the end of a map."""
        pass

    def read_index(self):
        """A union is encoded by first writing a long value indicating the
        zero-based position within the union of the schema of its value.

        The value is then encoded per the indicated schema within the union.
        """
        return self.read_long()