1# Protocol Buffers - Google's data interchange format
2# Copyright 2008 Google Inc. All rights reserved.
3#
4# Use of this source code is governed by a BSD-style
5# license that can be found in the LICENSE file or at
6# https://developers.google.com/open-source/licenses/bsd
7
8"""Contains routines for printing protocol messages in text format.
9
10Simple usage example::
11
12 # Create a proto object and serialize it to a text proto string.
13 message = my_proto_pb2.MyMessage(foo='bar')
14 text_proto = text_format.MessageToString(message)
15
16 # Parse a text proto string.
17 message = text_format.Parse(text_proto, my_proto_pb2.MyMessage())
18"""
19
20__author__ = 'kenton@google.com (Kenton Varda)'
21
22# TODO Import thread contention leads to test failures.
23import encodings.raw_unicode_escape # pylint: disable=unused-import
24import encodings.unicode_escape # pylint: disable=unused-import
25import io
26import math
27import re
28
29from google.protobuf.internal import decoder
30from google.protobuf.internal import type_checkers
31from google.protobuf import descriptor
32from google.protobuf import text_encoding
33from google.protobuf import unknown_fields
34
35# pylint: disable=g-import-not-at-top
36__all__ = ['MessageToString', 'Parse', 'PrintMessage', 'PrintField',
37 'PrintFieldValue', 'Merge', 'MessageToBytes']
38
39_INTEGER_CHECKERS = (type_checkers.Uint32ValueChecker(),
40 type_checkers.Int32ValueChecker(),
41 type_checkers.Uint64ValueChecker(),
42 type_checkers.Int64ValueChecker())
43_FLOAT_INFINITY = re.compile('-?inf(?:inity)?f?$', re.IGNORECASE)
44_FLOAT_NAN = re.compile('nanf?$', re.IGNORECASE)
45_QUOTES = frozenset(("'", '"'))
46_ANY_FULL_TYPE_NAME = 'google.protobuf.Any'
47_DEBUG_STRING_SILENT_MARKER = '\t '
48
49_as_utf8_default = True
50
51
52class Error(Exception):
53 """Top-level module error for text_format."""
54
55
56class ParseError(Error):
57 """Thrown in case of text parsing or tokenizing error."""
58
59 def __init__(self, message=None, line=None, column=None):
60 if message is not None and line is not None:
61 loc = str(line)
62 if column is not None:
63 loc += ':{0}'.format(column)
64 message = '{0} : {1}'.format(loc, message)
65 if message is not None:
66 super(ParseError, self).__init__(message)
67 else:
68 super(ParseError, self).__init__()
69 self._line = line
70 self._column = column
71
72 def GetLine(self):
73 return self._line
74
75 def GetColumn(self):
76 return self._column
77
78
79class TextWriter(object):
80
81 def __init__(self, as_utf8):
82 self._writer = io.StringIO()
83
84 def write(self, val):
85 return self._writer.write(val)
86
87 def close(self):
88 return self._writer.close()
89
90 def getvalue(self):
91 return self._writer.getvalue()
92
93
94def MessageToString(
95 message,
96 as_utf8=_as_utf8_default,
97 as_one_line=False,
98 use_short_repeated_primitives=False,
99 pointy_brackets=False,
100 use_index_order=False,
101 float_format=None,
102 double_format=None,
103 use_field_number=False,
104 descriptor_pool=None,
105 indent=0,
106 message_formatter=None,
107 print_unknown_fields=False,
108 force_colon=False) -> str:
109 """Convert protobuf message to text format.
110
111 Double values can be formatted compactly with 15 digits of
112 precision (which is the most that IEEE 754 "double" can guarantee)
113 using double_format='.15g'. To ensure that converting to text and back to a
114 proto will result in an identical value, double_format='.17g' should be used.
115
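  For example, a round-trip-safe conversion might look like this (a sketch;
  ``my_proto_pb2.MyMessage`` and its double field ``price`` are hypothetical)::

    message = my_proto_pb2.MyMessage(price=0.1)
    text_proto = text_format.MessageToString(message, double_format='.17g')
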
116 Args:
117 message: The protocol buffers message.
118 as_utf8: Return unescaped Unicode for non-ASCII characters.
119 as_one_line: Don't introduce newlines between fields.
120 use_short_repeated_primitives: Use short repeated format for primitives.
121 pointy_brackets: If True, use angle brackets instead of curly braces for
122 nesting.
123 use_index_order: If True, fields of a proto message will be printed using
124 the order defined in source code instead of the field number, extensions
125 will be printed at the end of the message and their relative order is
126 determined by the extension number. By default, use the field number
127 order.
    float_format (str): If set, use this to specify float field formatting
      (per the "Format Specification Mini-Language"); otherwise, the shortest
      float that has the same value on the wire will be printed. This also
      affects double fields if double_format is not set but float_format is.
    double_format (str): If set, use this to specify double field formatting
      (per the "Format Specification Mini-Language"); if it is not set but
      float_format is set, use float_format. Otherwise, use ``str()``.
135 use_field_number: If True, print field numbers instead of names.
136 descriptor_pool (DescriptorPool): Descriptor pool used to resolve Any types.
137 indent (int): The initial indent level, in terms of spaces, for pretty
138 print.
139 message_formatter (function(message, indent, as_one_line) -> unicode|None):
140 Custom formatter for selected sub-messages (usually based on message
141 type). Use to pretty print parts of the protobuf for easier diffing.
142 print_unknown_fields: If True, unknown fields will be printed.
143 force_colon: If set, a colon will be added after the field name even if the
144 field is a proto message.
145
146 Returns:
147 str: A string of the text formatted protocol buffer message.
148 """
149 out = TextWriter(as_utf8)
150 printer = _Printer(
151 out,
152 indent,
153 as_utf8,
154 as_one_line,
155 use_short_repeated_primitives,
156 pointy_brackets,
157 use_index_order,
158 float_format,
159 double_format,
160 use_field_number,
161 descriptor_pool,
162 message_formatter,
163 print_unknown_fields=print_unknown_fields,
164 force_colon=force_colon)
165 printer.PrintMessage(message)
166 result = out.getvalue()
167 out.close()
168 if as_one_line:
169 return result.rstrip()
170 return result
171
172
173def MessageToBytes(message, **kwargs) -> bytes:
174 """Convert protobuf message to encoded text format. See MessageToString."""
175 text = MessageToString(message, **kwargs)
176 if isinstance(text, bytes):
177 return text
178 codec = 'utf-8' if kwargs.get('as_utf8') else 'ascii'
179 return text.encode(codec)
180
181
182def _IsMapEntry(field):
183 return (field.type == descriptor.FieldDescriptor.TYPE_MESSAGE and
184 field.message_type.has_options and
185 field.message_type.GetOptions().map_entry)
186
187
188def _IsGroupLike(field):
189 """Determines if a field is consistent with a proto2 group.
190
191 Args:
192 field: The field descriptor.
193
194 Returns:
195 True if this field is group-like, false otherwise.
196 """
197 # Groups are always tag-delimited.
198 if field.type != descriptor.FieldDescriptor.TYPE_GROUP:
199 return False
200
  # Group field names are always the lowercased form of their type name.
202 if field.name != field.message_type.name.lower():
203 return False
204
205 if field.message_type.file != field.file:
206 return False
207
208 # Group messages are always defined in the same scope as the field. File
209 # level extensions will compare NULL == NULL here, which is why the file
210 # comparison above is necessary to ensure both come from the same file.
211 return (
212 field.message_type.containing_type == field.extension_scope
213 if field.is_extension
214 else field.message_type.containing_type == field.containing_type
215 )
216
217
218def PrintMessage(message,
219 out,
220 indent=0,
221 as_utf8=_as_utf8_default,
222 as_one_line=False,
223 use_short_repeated_primitives=False,
224 pointy_brackets=False,
225 use_index_order=False,
226 float_format=None,
227 double_format=None,
228 use_field_number=False,
229 descriptor_pool=None,
230 message_formatter=None,
231 print_unknown_fields=False,
232 force_colon=False):
233 """Convert the message to text format and write it to the out stream.
234
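  A minimal sketch of the stream-based variant (``my_proto_pb2.MyMessage`` is
  a hypothetical generated message type)::

    import io

    out = io.StringIO()
    text_format.PrintMessage(my_proto_pb2.MyMessage(foo='bar'), out)
    text_proto = out.getvalue()
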
235 Args:
236 message: The Message object to convert to text format.
237 out: A file handle to write the message to.
238 indent: The initial indent level for pretty print.
239 as_utf8: Return unescaped Unicode for non-ASCII characters.
240 as_one_line: Don't introduce newlines between fields.
241 use_short_repeated_primitives: Use short repeated format for primitives.
242 pointy_brackets: If True, use angle brackets instead of curly braces for
243 nesting.
244 use_index_order: If True, print fields of a proto message using the order
245 defined in source code instead of the field number. By default, use the
246 field number order.
    float_format: If set, use this to specify float field formatting
      (per the "Format Specification Mini-Language"); otherwise, the shortest
      float that has the same value on the wire will be printed. This also
      affects double fields if double_format is not set but float_format is.
251 double_format: If set, use this to specify double field formatting
252 (per the "Format Specification Mini-Language"); if it is not set but
253 float_format is set, use float_format. Otherwise, str() is used.
254 use_field_number: If True, print field numbers instead of names.
255 descriptor_pool: A DescriptorPool used to resolve Any types.
256 message_formatter: A function(message, indent, as_one_line): unicode|None
257 to custom format selected sub-messages (usually based on message type).
258 Use to pretty print parts of the protobuf for easier diffing.
259 print_unknown_fields: If True, unknown fields will be printed.
260 force_colon: If set, a colon will be added after the field name even if
261 the field is a proto message.
262 """
263 printer = _Printer(
264 out=out, indent=indent, as_utf8=as_utf8,
265 as_one_line=as_one_line,
266 use_short_repeated_primitives=use_short_repeated_primitives,
267 pointy_brackets=pointy_brackets,
268 use_index_order=use_index_order,
269 float_format=float_format,
270 double_format=double_format,
271 use_field_number=use_field_number,
272 descriptor_pool=descriptor_pool,
273 message_formatter=message_formatter,
274 print_unknown_fields=print_unknown_fields,
275 force_colon=force_colon)
276 printer.PrintMessage(message)
277
278
279def PrintField(field,
280 value,
281 out,
282 indent=0,
283 as_utf8=_as_utf8_default,
284 as_one_line=False,
285 use_short_repeated_primitives=False,
286 pointy_brackets=False,
287 use_index_order=False,
288 float_format=None,
289 double_format=None,
290 message_formatter=None,
291 print_unknown_fields=False,
292 force_colon=False):
293 """Print a single field name/value pair."""
294 printer = _Printer(out, indent, as_utf8, as_one_line,
295 use_short_repeated_primitives, pointy_brackets,
296 use_index_order, float_format, double_format,
297 message_formatter=message_formatter,
298 print_unknown_fields=print_unknown_fields,
299 force_colon=force_colon)
300 printer.PrintField(field, value)
301
302
303def PrintFieldValue(field,
304 value,
305 out,
306 indent=0,
307 as_utf8=_as_utf8_default,
308 as_one_line=False,
309 use_short_repeated_primitives=False,
310 pointy_brackets=False,
311 use_index_order=False,
312 float_format=None,
313 double_format=None,
314 message_formatter=None,
315 print_unknown_fields=False,
316 force_colon=False):
317 """Print a single field value (not including name)."""
318 printer = _Printer(out, indent, as_utf8, as_one_line,
319 use_short_repeated_primitives, pointy_brackets,
320 use_index_order, float_format, double_format,
321 message_formatter=message_formatter,
322 print_unknown_fields=print_unknown_fields,
323 force_colon=force_colon)
324 printer.PrintFieldValue(field, value)
325
326
327def _BuildMessageFromTypeName(type_name, descriptor_pool):
328 """Returns a protobuf message instance.
329
330 Args:
331 type_name: Fully-qualified protobuf message type name string.
332 descriptor_pool: DescriptorPool instance.
333
334 Returns:
    A Message instance of the type matching type_name, or None if no
    Descriptor matching type_name was found.
337 """
338 # pylint: disable=g-import-not-at-top
339 if descriptor_pool is None:
340 from google.protobuf import descriptor_pool as pool_mod
341 descriptor_pool = pool_mod.Default()
342 from google.protobuf import message_factory
343 try:
344 message_descriptor = descriptor_pool.FindMessageTypeByName(type_name)
345 except KeyError:
346 return None
347 message_type = message_factory.GetMessageClass(message_descriptor)
348 return message_type()
349
350
351# These values must match WireType enum in //google/protobuf/wire_format.h.
352WIRETYPE_LENGTH_DELIMITED = 2
353WIRETYPE_START_GROUP = 3
354
355
356class _Printer(object):
357 """Text format printer for protocol message."""
358
359 def __init__(
360 self,
361 out,
362 indent=0,
363 as_utf8=_as_utf8_default,
364 as_one_line=False,
365 use_short_repeated_primitives=False,
366 pointy_brackets=False,
367 use_index_order=False,
368 float_format=None,
369 double_format=None,
370 use_field_number=False,
371 descriptor_pool=None,
372 message_formatter=None,
373 print_unknown_fields=False,
374 force_colon=False):
375 """Initialize the Printer.
376
377 Double values can be formatted compactly with 15 digits of precision
378 (which is the most that IEEE 754 "double" can guarantee) using
379 double_format='.15g'. To ensure that converting to text and back to a proto
380 will result in an identical value, double_format='.17g' should be used.
381
382 Args:
383 out: To record the text format result.
384 indent: The initial indent level for pretty print.
385 as_utf8: Return unescaped Unicode for non-ASCII characters.
386 as_one_line: Don't introduce newlines between fields.
387 use_short_repeated_primitives: Use short repeated format for primitives.
388 pointy_brackets: If True, use angle brackets instead of curly braces for
389 nesting.
390 use_index_order: If True, print fields of a proto message using the order
391 defined in source code instead of the field number. By default, use the
392 field number order.
      float_format: If set, use this to specify float field formatting
        (per the "Format Specification Mini-Language"); otherwise, the shortest
        float that has the same value on the wire will be printed. This also
        affects double fields if double_format is not set but float_format is.
397 double_format: If set, use this to specify double field formatting
398 (per the "Format Specification Mini-Language"); if it is not set but
399 float_format is set, use float_format. Otherwise, str() is used.
400 use_field_number: If True, print field numbers instead of names.
401 descriptor_pool: A DescriptorPool used to resolve Any types.
402 message_formatter: A function(message, indent, as_one_line): unicode|None
403 to custom format selected sub-messages (usually based on message type).
404 Use to pretty print parts of the protobuf for easier diffing.
405 print_unknown_fields: If True, unknown fields will be printed.
406 force_colon: If set, a colon will be added after the field name even if
407 the field is a proto message.
408 """
409 self.out = out
410 self.indent = indent
411 self.as_utf8 = as_utf8
412 self.as_one_line = as_one_line
413 self.use_short_repeated_primitives = use_short_repeated_primitives
414 self.pointy_brackets = pointy_brackets
415 self.use_index_order = use_index_order
416 self.float_format = float_format
417 if double_format is not None:
418 self.double_format = double_format
419 else:
420 self.double_format = float_format
421 self.use_field_number = use_field_number
422 self.descriptor_pool = descriptor_pool
423 self.message_formatter = message_formatter
424 self.print_unknown_fields = print_unknown_fields
425 self.force_colon = force_colon
426
427 def _TryPrintAsAnyMessage(self, message):
428 """Serializes if message is a google.protobuf.Any field."""
429 if '/' not in message.type_url:
430 return False
431 packed_message = _BuildMessageFromTypeName(message.TypeName(),
432 self.descriptor_pool)
433 if packed_message:
434 packed_message.MergeFromString(message.value)
435 colon = ':' if self.force_colon else ''
436 self.out.write('%s[%s]%s ' % (self.indent * ' ', message.type_url, colon))
437 self._PrintMessageFieldValue(packed_message)
438 self.out.write(' ' if self.as_one_line else '\n')
439 return True
440 else:
441 return False
442
443 def _TryCustomFormatMessage(self, message):
444 formatted = self.message_formatter(message, self.indent, self.as_one_line)
445 if formatted is None:
446 return False
447
448 out = self.out
449 out.write(' ' * self.indent)
450 out.write(formatted)
451 out.write(' ' if self.as_one_line else '\n')
452 return True
453
454 def PrintMessage(self, message):
455 """Convert protobuf message to text format.
456
457 Args:
458 message: The protocol buffers message.
459 """
460 if self.message_formatter and self._TryCustomFormatMessage(message):
461 return
462 if (message.DESCRIPTOR.full_name == _ANY_FULL_TYPE_NAME and
463 self._TryPrintAsAnyMessage(message)):
464 return
465 fields = message.ListFields()
466 if self.use_index_order:
467 fields.sort(
468 key=lambda x: x[0].number if x[0].is_extension else x[0].index)
469 for field, value in fields:
470 if _IsMapEntry(field):
471 for key in sorted(value):
472 # This is slow for maps with submessage entries because it copies the
473 # entire tree. Unfortunately this would take significant refactoring
474 # of this file to work around.
475 #
476 # TODO: refactor and optimize if this becomes an issue.
477 entry_submsg = value.GetEntryClass()(key=key, value=value[key])
478 self.PrintField(field, entry_submsg)
479 elif field.label == descriptor.FieldDescriptor.LABEL_REPEATED:
480 if (self.use_short_repeated_primitives
481 and field.cpp_type != descriptor.FieldDescriptor.CPPTYPE_MESSAGE
482 and field.cpp_type != descriptor.FieldDescriptor.CPPTYPE_STRING):
483 self._PrintShortRepeatedPrimitivesValue(field, value)
484 else:
485 for element in value:
486 self.PrintField(field, element)
487 else:
488 self.PrintField(field, value)
489
490 if self.print_unknown_fields:
491 self._PrintUnknownFields(unknown_fields.UnknownFieldSet(message))
492
493 def _PrintUnknownFields(self, unknown_field_set):
494 """Print unknown fields."""
495 out = self.out
496 for field in unknown_field_set:
497 out.write(' ' * self.indent)
498 out.write(str(field.field_number))
499 if field.wire_type == WIRETYPE_START_GROUP:
500 if self.as_one_line:
501 out.write(' { ')
502 else:
503 out.write(' {\n')
504 self.indent += 2
505
506 self._PrintUnknownFields(field.data)
507
508 if self.as_one_line:
509 out.write('} ')
510 else:
511 self.indent -= 2
512 out.write(' ' * self.indent + '}\n')
513 elif field.wire_type == WIRETYPE_LENGTH_DELIMITED:
514 try:
515 # If this field is parseable as a Message, it is probably
516 # an embedded message.
517 # pylint: disable=protected-access
518 (embedded_unknown_message, pos) = decoder._DecodeUnknownFieldSet(
519 memoryview(field.data), 0, len(field.data))
520 except Exception: # pylint: disable=broad-except
521 pos = 0
522
523 if pos == len(field.data):
524 if self.as_one_line:
525 out.write(' { ')
526 else:
527 out.write(' {\n')
528 self.indent += 2
529
530 self._PrintUnknownFields(embedded_unknown_message)
531
532 if self.as_one_line:
533 out.write('} ')
534 else:
535 self.indent -= 2
536 out.write(' ' * self.indent + '}\n')
537 else:
538 # A string or bytes field. self.as_utf8 may not work.
539 out.write(': \"')
540 out.write(text_encoding.CEscape(field.data, False))
541 out.write('\" ' if self.as_one_line else '\"\n')
542 else:
543 # varint, fixed32, fixed64
544 out.write(': ')
545 out.write(str(field.data))
546 out.write(' ' if self.as_one_line else '\n')
547
548 def _PrintFieldName(self, field):
549 """Print field name."""
550 out = self.out
551 out.write(' ' * self.indent)
552 if self.use_field_number:
553 out.write(str(field.number))
554 else:
555 if field.is_extension:
556 out.write('[')
557 if (field.containing_type.GetOptions().message_set_wire_format and
558 field.type == descriptor.FieldDescriptor.TYPE_MESSAGE and
559 field.label == descriptor.FieldDescriptor.LABEL_OPTIONAL):
560 out.write(field.message_type.full_name)
561 else:
562 out.write(field.full_name)
563 out.write(']')
564 elif _IsGroupLike(field):
565 # For groups, use the capitalized name.
566 out.write(field.message_type.name)
567 else:
568 out.write(field.name)
569
570 if (self.force_colon or
571 field.cpp_type != descriptor.FieldDescriptor.CPPTYPE_MESSAGE):
      # The colon is optional in this case, but our cross-language golden files
      # don't include it. Here, the colon is only included if force_colon is
      # set to True.
575 out.write(':')
576
577 def PrintField(self, field, value):
578 """Print a single field name/value pair."""
579 self._PrintFieldName(field)
580 self.out.write(' ')
581 self.PrintFieldValue(field, value)
582 self.out.write(' ' if self.as_one_line else '\n')
583
584 def _PrintShortRepeatedPrimitivesValue(self, field, value):
585 """"Prints short repeated primitives value."""
586 # Note: this is called only when value has at least one element.
587 self._PrintFieldName(field)
588 self.out.write(' [')
589 for i in range(len(value) - 1):
590 self.PrintFieldValue(field, value[i])
591 self.out.write(', ')
592 self.PrintFieldValue(field, value[-1])
593 self.out.write(']')
594 self.out.write(' ' if self.as_one_line else '\n')
595
596 def _PrintMessageFieldValue(self, value):
597 if self.pointy_brackets:
598 openb = '<'
599 closeb = '>'
600 else:
601 openb = '{'
602 closeb = '}'
603
604 if self.as_one_line:
605 self.out.write('%s ' % openb)
606 self.PrintMessage(value)
607 self.out.write(closeb)
608 else:
609 self.out.write('%s\n' % openb)
610 self.indent += 2
611 self.PrintMessage(value)
612 self.indent -= 2
613 self.out.write(' ' * self.indent + closeb)
614
615 def PrintFieldValue(self, field, value):
616 """Print a single field value (not including name).
617
618 For repeated fields, the value should be a single element.
619
620 Args:
621 field: The descriptor of the field to be printed.
622 value: The value of the field.
623 """
624 out = self.out
625 if field.cpp_type == descriptor.FieldDescriptor.CPPTYPE_MESSAGE:
626 self._PrintMessageFieldValue(value)
627 elif field.cpp_type == descriptor.FieldDescriptor.CPPTYPE_ENUM:
628 enum_value = field.enum_type.values_by_number.get(value, None)
629 if enum_value is not None:
630 out.write(enum_value.name)
631 else:
632 out.write(str(value))
633 elif field.cpp_type == descriptor.FieldDescriptor.CPPTYPE_STRING:
634 out.write('\"')
635 if isinstance(value, str) and not self.as_utf8:
636 out_value = value.encode('utf-8')
637 else:
638 out_value = value
639 if field.type == descriptor.FieldDescriptor.TYPE_BYTES:
640 # We always need to escape all binary data in TYPE_BYTES fields.
641 out_as_utf8 = False
642 else:
643 out_as_utf8 = self.as_utf8
644 out.write(text_encoding.CEscape(out_value, out_as_utf8))
645 out.write('\"')
646 elif field.cpp_type == descriptor.FieldDescriptor.CPPTYPE_BOOL:
647 if value:
648 out.write('true')
649 else:
650 out.write('false')
651 elif field.cpp_type == descriptor.FieldDescriptor.CPPTYPE_FLOAT:
652 if self.float_format is not None:
653 out.write('{1:{0}}'.format(self.float_format, value))
654 else:
655 if math.isnan(value):
656 out.write(str(value))
657 else:
658 out.write(str(type_checkers.ToShortestFloat(value)))
659 elif (field.cpp_type == descriptor.FieldDescriptor.CPPTYPE_DOUBLE and
660 self.double_format is not None):
661 out.write('{1:{0}}'.format(self.double_format, value))
662 else:
663 out.write(str(value))
664
665
666def Parse(text,
667 message,
668 allow_unknown_extension=False,
669 allow_field_number=False,
670 descriptor_pool=None,
671 allow_unknown_field=False):
672 """Parses a text representation of a protocol message into a message.
673
674 NOTE: for historical reasons this function does not clear the input
675 message. This is different from what the binary msg.ParseFrom(...) does.
676 If text contains a field already set in message, the value is appended if the
677 field is repeated. Otherwise, an error is raised.
678
679 Example::
680
681 a = MyProto()
682 a.repeated_field.append('test')
683 b = MyProto()
684
685 # Repeated fields are combined
686 text_format.Parse(repr(a), b)
687 text_format.Parse(repr(a), b) # repeated_field contains ["test", "test"]
688
689 # Non-repeated fields cannot be overwritten
690 a.singular_field = 1
691 b.singular_field = 2
692 text_format.Parse(repr(a), b) # ParseError
693
694 # Binary version:
695 b.ParseFromString(a.SerializeToString()) # repeated_field is now "test"
696
697 Caller is responsible for clearing the message as needed.
698
699 Args:
700 text (str): Message text representation.
701 message (Message): A protocol buffer message to merge into.
    allow_unknown_extension: if True, skip over missing extensions and keep
      parsing.
    allow_field_number: if True, both field number and field name are allowed.
    descriptor_pool (DescriptorPool): Descriptor pool used to resolve Any types.
    allow_unknown_field: if True, skip over unknown fields and keep
      parsing. Avoid using this option if possible. It may hide some
      errors (e.g. a spelling error in a field name).
709
710 Returns:
711 Message: The same message passed as argument.
712
713 Raises:
714 ParseError: On text parsing problems.
715 """
716 return ParseLines(text.split(b'\n' if isinstance(text, bytes) else u'\n'),
717 message,
718 allow_unknown_extension,
719 allow_field_number,
720 descriptor_pool=descriptor_pool,
721 allow_unknown_field=allow_unknown_field)
722
723
724def Merge(text,
725 message,
726 allow_unknown_extension=False,
727 allow_field_number=False,
728 descriptor_pool=None,
729 allow_unknown_field=False):
730 """Parses a text representation of a protocol message into a message.
731
732 Like Parse(), but allows repeated values for a non-repeated field, and uses
733 the last one. This means any non-repeated, top-level fields specified in text
734 replace those in the message.
735
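  Example (a sketch; ``singular_field`` is a hypothetical int32 field)::

    a = MyProto()
    text_format.Merge('singular_field: 1 singular_field: 2', a)
    # a.singular_field is now 2; Parse() would raise ParseError here.
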
736 Args:
737 text (str): Message text representation.
738 message (Message): A protocol buffer message to merge into.
    allow_unknown_extension: if True, skip over missing extensions and keep
      parsing.
    allow_field_number: if True, both field number and field name are allowed.
    descriptor_pool (DescriptorPool): Descriptor pool used to resolve Any types.
    allow_unknown_field: if True, skip over unknown fields and keep
      parsing. Avoid using this option if possible. It may hide some
      errors (e.g. a spelling error in a field name).
746
747 Returns:
748 Message: The same message passed as argument.
749
750 Raises:
751 ParseError: On text parsing problems.
752 """
753 return MergeLines(
754 text.split(b'\n' if isinstance(text, bytes) else u'\n'),
755 message,
756 allow_unknown_extension,
757 allow_field_number,
758 descriptor_pool=descriptor_pool,
759 allow_unknown_field=allow_unknown_field)
760
761
762def ParseLines(lines,
763 message,
764 allow_unknown_extension=False,
765 allow_field_number=False,
766 descriptor_pool=None,
767 allow_unknown_field=False):
768 """Parses a text representation of a protocol message into a message.
769
770 See Parse() for caveats.
771
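  For example, the following is roughly equivalent to calling Parse() on the
  joined text (``MyProto`` and ``text_proto`` as in the examples above)::

    message = text_format.ParseLines(text_proto.splitlines(), MyProto())
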
772 Args:
773 lines: An iterable of lines of a message's text representation.
774 message: A protocol buffer message to merge into.
    allow_unknown_extension: if True, skip over missing extensions and keep
      parsing.
    allow_field_number: if True, both field number and field name are allowed.
    descriptor_pool: A DescriptorPool used to resolve Any types.
    allow_unknown_field: if True, skip over unknown fields and keep
      parsing. Avoid using this option if possible. It may hide some
      errors (e.g. a spelling error in a field name).
782
783 Returns:
784 The same message passed as argument.
785
786 Raises:
787 ParseError: On text parsing problems.
788 """
789 parser = _Parser(allow_unknown_extension,
790 allow_field_number,
791 descriptor_pool=descriptor_pool,
792 allow_unknown_field=allow_unknown_field)
793 return parser.ParseLines(lines, message)
794
795
796def MergeLines(lines,
797 message,
798 allow_unknown_extension=False,
799 allow_field_number=False,
800 descriptor_pool=None,
801 allow_unknown_field=False):
802 """Parses a text representation of a protocol message into a message.
803
804 See Merge() for more details.
805
806 Args:
807 lines: An iterable of lines of a message's text representation.
808 message: A protocol buffer message to merge into.
    allow_unknown_extension: if True, skip over missing extensions and keep
      parsing.
    allow_field_number: if True, both field number and field name are allowed.
    descriptor_pool: A DescriptorPool used to resolve Any types.
    allow_unknown_field: if True, skip over unknown fields and keep
      parsing. Avoid using this option if possible. It may hide some
      errors (e.g. a spelling error in a field name).
816
817 Returns:
818 The same message passed as argument.
819
820 Raises:
821 ParseError: On text parsing problems.
822 """
823 parser = _Parser(allow_unknown_extension,
824 allow_field_number,
825 descriptor_pool=descriptor_pool,
826 allow_unknown_field=allow_unknown_field)
827 return parser.MergeLines(lines, message)
828
829
830class _Parser(object):
831 """Text format parser for protocol message."""
832
833 def __init__(self,
834 allow_unknown_extension=False,
835 allow_field_number=False,
836 descriptor_pool=None,
837 allow_unknown_field=False):
838 self.allow_unknown_extension = allow_unknown_extension
839 self.allow_field_number = allow_field_number
840 self.descriptor_pool = descriptor_pool
841 self.allow_unknown_field = allow_unknown_field
842
843 def ParseLines(self, lines, message):
844 """Parses a text representation of a protocol message into a message."""
845 self._allow_multiple_scalars = False
846 self._ParseOrMerge(lines, message)
847 return message
848
849 def MergeLines(self, lines, message):
850 """Merges a text representation of a protocol message into a message."""
851 self._allow_multiple_scalars = True
852 self._ParseOrMerge(lines, message)
853 return message
854
855 def _ParseOrMerge(self, lines, message):
856 """Converts a text representation of a protocol message into a message.
857
858 Args:
859 lines: Lines of a message's text representation.
860 message: A protocol buffer message to merge into.
861
862 Raises:
863 ParseError: On text parsing problems.
864 """
865 # Tokenize expects native str lines.
866 try:
867 str_lines = (
868 line if isinstance(line, str) else line.decode('utf-8')
869 for line in lines)
870 tokenizer = Tokenizer(str_lines)
871 except UnicodeDecodeError as e:
872 raise ParseError from e
873 if message:
874 self.root_type = message.DESCRIPTOR.full_name
875 while not tokenizer.AtEnd():
876 self._MergeField(tokenizer, message)
877
878 def _MergeField(self, tokenizer, message):
879 """Merges a single protocol message field into a message.
880
881 Args:
882 tokenizer: A tokenizer to parse the field name and values.
883 message: A protocol message to record the data.
884
885 Raises:
886 ParseError: In case of text parsing problems.
887 """
888 message_descriptor = message.DESCRIPTOR
889 if (message_descriptor.full_name == _ANY_FULL_TYPE_NAME and
890 tokenizer.TryConsume('[')):
891 type_url_prefix, packed_type_name = self._ConsumeAnyTypeUrl(tokenizer)
892 tokenizer.Consume(']')
893 tokenizer.TryConsume(':')
894 self._DetectSilentMarker(tokenizer, message_descriptor.full_name,
895 type_url_prefix + '/' + packed_type_name)
896 if tokenizer.TryConsume('<'):
897 expanded_any_end_token = '>'
898 else:
899 tokenizer.Consume('{')
900 expanded_any_end_token = '}'
901 expanded_any_sub_message = _BuildMessageFromTypeName(packed_type_name,
902 self.descriptor_pool)
903 # Direct comparison with None is used instead of implicit bool conversion
904 # to avoid false positives with falsy initial values, e.g. for
905 # google.protobuf.ListValue.
906 if expanded_any_sub_message is None:
907 raise ParseError('Type %s not found in descriptor pool' %
908 packed_type_name)
909 while not tokenizer.TryConsume(expanded_any_end_token):
910 if tokenizer.AtEnd():
911 raise tokenizer.ParseErrorPreviousToken('Expected "%s".' %
912 (expanded_any_end_token,))
913 self._MergeField(tokenizer, expanded_any_sub_message)
914 deterministic = False
915
916 message.Pack(expanded_any_sub_message,
917 type_url_prefix=type_url_prefix,
918 deterministic=deterministic)
919 return
920
921 if tokenizer.TryConsume('['):
922 name = [tokenizer.ConsumeIdentifier()]
923 while tokenizer.TryConsume('.'):
924 name.append(tokenizer.ConsumeIdentifier())
925 name = '.'.join(name)
926
927 if not message_descriptor.is_extendable:
928 raise tokenizer.ParseErrorPreviousToken(
929 'Message type "%s" does not have extensions.' %
930 message_descriptor.full_name)
931 # pylint: disable=protected-access
932 field = message.Extensions._FindExtensionByName(name)
933 # pylint: enable=protected-access
934 if not field:
935 if self.allow_unknown_extension:
936 field = None
937 else:
938 raise tokenizer.ParseErrorPreviousToken(
939 'Extension "%s" not registered. '
940 'Did you import the _pb2 module which defines it? '
941 'If you are trying to place the extension in the MessageSet '
942 'field of another message that is in an Any or MessageSet field, '
943 'that message\'s _pb2 module must be imported as well' % name)
944 elif message_descriptor != field.containing_type:
945 raise tokenizer.ParseErrorPreviousToken(
946 'Extension "%s" does not extend message type "%s".' %
947 (name, message_descriptor.full_name))
948
949 tokenizer.Consume(']')
950
951 else:
952 name = tokenizer.ConsumeIdentifierOrNumber()
953 if self.allow_field_number and name.isdigit():
954 number = ParseInteger(name, True, True)
955 field = message_descriptor.fields_by_number.get(number, None)
956 if not field and message_descriptor.is_extendable:
957 field = message.Extensions._FindExtensionByNumber(number)
958 else:
959 field = message_descriptor.fields_by_name.get(name, None)
960
961 # Group names are expected to be capitalized as they appear in the
962 # .proto file, which actually matches their type names, not their field
963 # names.
964 if not field:
965 field = message_descriptor.fields_by_name.get(name.lower(), None)
966 if field and not _IsGroupLike(field):
967 field = None
968 if field and field.message_type.name != name:
969 field = None
970
971 if not field and not self.allow_unknown_field:
972 raise tokenizer.ParseErrorPreviousToken(
973 'Message type "%s" has no field named "%s".' %
974 (message_descriptor.full_name, name))
975
976 if field:
977 if not self._allow_multiple_scalars and field.containing_oneof:
978 # Check if there's a different field set in this oneof.
979 # Note that we ignore the case if the same field was set before, and we
980 # apply _allow_multiple_scalars to non-scalar fields as well.
981 which_oneof = message.WhichOneof(field.containing_oneof.name)
982 if which_oneof is not None and which_oneof != field.name:
983 raise tokenizer.ParseErrorPreviousToken(
984 'Field "%s" is specified along with field "%s", another member '
985 'of oneof "%s" for message type "%s".' %
986 (field.name, which_oneof, field.containing_oneof.name,
987 message_descriptor.full_name))
988
989 if field.cpp_type == descriptor.FieldDescriptor.CPPTYPE_MESSAGE:
990 tokenizer.TryConsume(':')
991 self._DetectSilentMarker(tokenizer, message_descriptor.full_name,
992 field.full_name)
993 merger = self._MergeMessageField
994 else:
995 tokenizer.Consume(':')
996 self._DetectSilentMarker(tokenizer, message_descriptor.full_name,
997 field.full_name)
998 merger = self._MergeScalarField
999
1000 if (field.label == descriptor.FieldDescriptor.LABEL_REPEATED and
1001 tokenizer.TryConsume('[')):
1002 # Short repeated format, e.g. "foo: [1, 2, 3]"
1003 if not tokenizer.TryConsume(']'):
1004 while True:
1005 merger(tokenizer, message, field)
1006 if tokenizer.TryConsume(']'):
1007 break
1008 tokenizer.Consume(',')
1009
1010 else:
1011 merger(tokenizer, message, field)
1012
1013 else: # Proto field is unknown.
1014 assert (self.allow_unknown_extension or self.allow_unknown_field)
1015 self._SkipFieldContents(tokenizer, name, message_descriptor.full_name)
1016
1017 # For historical reasons, fields may optionally be separated by commas or
1018 # semicolons.
1019 if not tokenizer.TryConsume(','):
1020 tokenizer.TryConsume(';')
1021
1022 def _LogSilentMarker(self, immediate_message_type, field_name):
1023 pass
1024
1025 def _DetectSilentMarker(self, tokenizer, immediate_message_type, field_name):
1026 if tokenizer.contains_silent_marker_before_current_token:
1027 self._LogSilentMarker(immediate_message_type, field_name)
1028
1029 def _ConsumeAnyTypeUrl(self, tokenizer):
1030 """Consumes a google.protobuf.Any type URL and returns the type name."""
1031 # Consume "type.googleapis.com/".
1032 prefix = [tokenizer.ConsumeIdentifier()]
1033 tokenizer.Consume('.')
1034 prefix.append(tokenizer.ConsumeIdentifier())
1035 tokenizer.Consume('.')
1036 prefix.append(tokenizer.ConsumeIdentifier())
1037 tokenizer.Consume('/')
1038 # Consume the fully-qualified type name.
1039 name = [tokenizer.ConsumeIdentifier()]
1040 while tokenizer.TryConsume('.'):
1041 name.append(tokenizer.ConsumeIdentifier())
1042 return '.'.join(prefix), '.'.join(name)
1043
1044 def _MergeMessageField(self, tokenizer, message, field):
1045 """Merges a single scalar field into a message.
1046
1047 Args:
1048 tokenizer: A tokenizer to parse the field value.
1049 message: The message of which field is a member.
1050 field: The descriptor of the field to be merged.
1051
1052 Raises:
1053 ParseError: In case of text parsing problems.
1054 """
1055 is_map_entry = _IsMapEntry(field)
1056
1057 if tokenizer.TryConsume('<'):
1058 end_token = '>'
1059 else:
1060 tokenizer.Consume('{')
1061 end_token = '}'
1062
1063 if field.label == descriptor.FieldDescriptor.LABEL_REPEATED:
1064 if field.is_extension:
1065 sub_message = message.Extensions[field].add()
1066 elif is_map_entry:
1067 sub_message = getattr(message, field.name).GetEntryClass()()
1068 else:
1069 sub_message = getattr(message, field.name).add()
1070 else:
1071 if field.is_extension:
1072 if (not self._allow_multiple_scalars and
1073 message.HasExtension(field)):
1074 raise tokenizer.ParseErrorPreviousToken(
1075 'Message type "%s" should not have multiple "%s" extensions.' %
1076 (message.DESCRIPTOR.full_name, field.full_name))
1077 sub_message = message.Extensions[field]
1078 else:
1079 # Also apply _allow_multiple_scalars to message field.
1080 # TODO: Change to _allow_singular_overwrites.
1081 if (not self._allow_multiple_scalars and
1082 message.HasField(field.name)):
1083 raise tokenizer.ParseErrorPreviousToken(
1084 'Message type "%s" should not have multiple "%s" fields.' %
1085 (message.DESCRIPTOR.full_name, field.name))
1086 sub_message = getattr(message, field.name)
1087 sub_message.SetInParent()
1088
1089 while not tokenizer.TryConsume(end_token):
1090 if tokenizer.AtEnd():
1091 raise tokenizer.ParseErrorPreviousToken('Expected "%s".' % (end_token,))
1092 self._MergeField(tokenizer, sub_message)
1093
1094 if is_map_entry:
1095 value_cpptype = field.message_type.fields_by_name['value'].cpp_type
1096 if value_cpptype == descriptor.FieldDescriptor.CPPTYPE_MESSAGE:
1097 value = getattr(message, field.name)[sub_message.key]
1098 value.CopyFrom(sub_message.value)
1099 else:
1100 getattr(message, field.name)[sub_message.key] = sub_message.value
1101
1102 def _MergeScalarField(self, tokenizer, message, field):
1103 """Merges a single scalar field into a message.
1104
1105 Args:
1106 tokenizer: A tokenizer to parse the field value.
1107 message: A protocol message to record the data.
1108 field: The descriptor of the field to be merged.
1109
1110 Raises:
1111 ParseError: In case of text parsing problems.
1112 RuntimeError: On runtime errors.
1113 """
1114 _ = self.allow_unknown_extension
1115 value = None
1116
1117 if field.type in (descriptor.FieldDescriptor.TYPE_INT32,
1118 descriptor.FieldDescriptor.TYPE_SINT32,
1119 descriptor.FieldDescriptor.TYPE_SFIXED32):
1120 value = _ConsumeInt32(tokenizer)
1121 elif field.type in (descriptor.FieldDescriptor.TYPE_INT64,
1122 descriptor.FieldDescriptor.TYPE_SINT64,
1123 descriptor.FieldDescriptor.TYPE_SFIXED64):
1124 value = _ConsumeInt64(tokenizer)
1125 elif field.type in (descriptor.FieldDescriptor.TYPE_UINT32,
1126 descriptor.FieldDescriptor.TYPE_FIXED32):
1127 value = _ConsumeUint32(tokenizer)
1128 elif field.type in (descriptor.FieldDescriptor.TYPE_UINT64,
1129 descriptor.FieldDescriptor.TYPE_FIXED64):
1130 value = _ConsumeUint64(tokenizer)
1131 elif field.type in (descriptor.FieldDescriptor.TYPE_FLOAT,
1132 descriptor.FieldDescriptor.TYPE_DOUBLE):
1133 value = tokenizer.ConsumeFloat()
1134 elif field.type == descriptor.FieldDescriptor.TYPE_BOOL:
1135 value = tokenizer.ConsumeBool()
1136 elif field.type == descriptor.FieldDescriptor.TYPE_STRING:
1137 value = tokenizer.ConsumeString()
1138 elif field.type == descriptor.FieldDescriptor.TYPE_BYTES:
1139 value = tokenizer.ConsumeByteString()
1140 elif field.type == descriptor.FieldDescriptor.TYPE_ENUM:
1141 value = tokenizer.ConsumeEnum(field)
1142 else:
1143 raise RuntimeError('Unknown field type %d' % field.type)
1144
1145 if field.label == descriptor.FieldDescriptor.LABEL_REPEATED:
1146 if field.is_extension:
1147 message.Extensions[field].append(value)
1148 else:
1149 getattr(message, field.name).append(value)
1150 else:
1151 if field.is_extension:
1152 if (not self._allow_multiple_scalars and
1153 field.has_presence and
1154 message.HasExtension(field)):
1155 raise tokenizer.ParseErrorPreviousToken(
1156 'Message type "%s" should not have multiple "%s" extensions.' %
1157 (message.DESCRIPTOR.full_name, field.full_name))
1158 else:
1159 message.Extensions[field] = value
1160 else:
1161 duplicate_error = False
1162 if not self._allow_multiple_scalars:
1163 if field.has_presence:
1164 duplicate_error = message.HasField(field.name)
1165 else:
            # For a field that doesn't track presence, make a best-effort
            # check for multiple scalars by comparing against the default value.
1168 duplicate_error = bool(getattr(message, field.name))
1169
1170 if duplicate_error:
1171 raise tokenizer.ParseErrorPreviousToken(
1172 'Message type "%s" should not have multiple "%s" fields.' %
1173 (message.DESCRIPTOR.full_name, field.name))
1174 else:
1175 setattr(message, field.name, value)
1176
1177 def _SkipFieldContents(self, tokenizer, field_name, immediate_message_type):
1178 """Skips over contents (value or message) of a field.
1179
1180 Args:
1181 tokenizer: A tokenizer to parse the field name and values.
1182 field_name: The field name currently being parsed.
1183 immediate_message_type: The type of the message immediately containing
1184 the silent marker.
1185 """
1186 # Try to guess the type of this field.
1187 # If this field is not a message, there should be a ":" between the
1188 # field name and the field value and also the field value should not
1189 # start with "{" or "<" which indicates the beginning of a message body.
1190 # If there is no ":" or there is a "{" or "<" after ":", this field has
1191 # to be a message or the input is ill-formed.
1192 if tokenizer.TryConsume(
1193 ':') and not tokenizer.LookingAt('{') and not tokenizer.LookingAt('<'):
1194 self._DetectSilentMarker(tokenizer, immediate_message_type, field_name)
1195 if tokenizer.LookingAt('['):
1196 self._SkipRepeatedFieldValue(tokenizer)
1197 else:
1198 self._SkipFieldValue(tokenizer)
1199 else:
1200 self._DetectSilentMarker(tokenizer, immediate_message_type, field_name)
1201 self._SkipFieldMessage(tokenizer, immediate_message_type)
1202
1203 def _SkipField(self, tokenizer, immediate_message_type):
1204 """Skips over a complete field (name and value/message).
1205
1206 Args:
1207 tokenizer: A tokenizer to parse the field name and values.
1208 immediate_message_type: The type of the message immediately containing
1209 the silent marker.
1210 """
1211 field_name = ''
1212 if tokenizer.TryConsume('['):
1213 # Consume extension or google.protobuf.Any type URL
1214 field_name += '[' + tokenizer.ConsumeIdentifier()
1215 num_identifiers = 1
1216 while tokenizer.TryConsume('.'):
1217 field_name += '.' + tokenizer.ConsumeIdentifier()
1218 num_identifiers += 1
1219 # This is possibly a type URL for an Any message.
1220 if num_identifiers == 3 and tokenizer.TryConsume('/'):
1221 field_name += '/' + tokenizer.ConsumeIdentifier()
1222 while tokenizer.TryConsume('.'):
1223 field_name += '.' + tokenizer.ConsumeIdentifier()
1224 tokenizer.Consume(']')
1225 field_name += ']'
1226 else:
1227 field_name += tokenizer.ConsumeIdentifierOrNumber()
1228
1229 self._SkipFieldContents(tokenizer, field_name, immediate_message_type)
1230
1231 # For historical reasons, fields may optionally be separated by commas or
1232 # semicolons.
1233 if not tokenizer.TryConsume(','):
1234 tokenizer.TryConsume(';')
1235
1236 def _SkipFieldMessage(self, tokenizer, immediate_message_type):
1237 """Skips over a field message.
1238
1239 Args:
1240 tokenizer: A tokenizer to parse the field name and values.
1241 immediate_message_type: The type of the message immediately containing
1242 the silent marker
1243 """
1244 if tokenizer.TryConsume('<'):
1245 delimiter = '>'
1246 else:
1247 tokenizer.Consume('{')
1248 delimiter = '}'
1249
1250 while not tokenizer.LookingAt('>') and not tokenizer.LookingAt('}'):
1251 self._SkipField(tokenizer, immediate_message_type)
1252
1253 tokenizer.Consume(delimiter)
1254
1255 def _SkipFieldValue(self, tokenizer):
1256 """Skips over a field value.
1257
1258 Args:
1259 tokenizer: A tokenizer to parse the field name and values.
1260
1261 Raises:
1262 ParseError: In case an invalid field value is found.
1263 """
    if (not tokenizer.TryConsumeByteString() and
1265 not tokenizer.TryConsumeIdentifier() and
1266 not _TryConsumeInt64(tokenizer) and
1267 not _TryConsumeUint64(tokenizer) and
1268 not tokenizer.TryConsumeFloat()):
1269 raise ParseError('Invalid field value: ' + tokenizer.token)
1270
1271 def _SkipRepeatedFieldValue(self, tokenizer):
1272 """Skips over a repeated field value.
1273
1274 Args:
1275 tokenizer: A tokenizer to parse the field value.
1276 """
1277 tokenizer.Consume('[')
1278 if not tokenizer.LookingAt(']'):
1279 self._SkipFieldValue(tokenizer)
1280 while tokenizer.TryConsume(','):
1281 self._SkipFieldValue(tokenizer)
1282 tokenizer.Consume(']')
1283
1284
1285class Tokenizer(object):
1286 """Protocol buffer text representation tokenizer.
1287
1288 This class handles the lower level string parsing by splitting it into
1289 meaningful tokens.
1290
1291 It was directly ported from the Java protocol buffer API.
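
  A short sketch of direct use (normally this class is only used internally by
  the parsing helpers above; 'foo' is an arbitrary example field name)::

    tokenizer = Tokenizer(['foo: 1'])
    tokenizer.ConsumeIdentifier()   # returns 'foo'
    tokenizer.Consume(':')
    tokenizer.ConsumeInteger()      # returns 1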
1292 """
1293
1294 _WHITESPACE = re.compile(r'\s+')
1295 _COMMENT = re.compile(r'(\s*#.*$)', re.MULTILINE)
1296 _WHITESPACE_OR_COMMENT = re.compile(r'(\s|(#.*$))+', re.MULTILINE)
1297 _TOKEN = re.compile('|'.join([
1298 r'[a-zA-Z_][0-9a-zA-Z_+-]*', # an identifier
1299 r'([0-9+-]|(\.[0-9]))[0-9a-zA-Z_.+-]*', # a number
1300 ] + [ # quoted str for each quote mark
1301 # Avoid backtracking! https://stackoverflow.com/a/844267
1302 r'{qt}[^{qt}\n\\]*((\\.)+[^{qt}\n\\]*)*({qt}|\\?$)'.format(qt=mark)
1303 for mark in _QUOTES
1304 ]))
1305
1306 _IDENTIFIER = re.compile(r'[^\d\W]\w*')
1307 _IDENTIFIER_OR_NUMBER = re.compile(r'\w+')
1308
1309 def __init__(self, lines, skip_comments=True):
1310 self._position = 0
1311 self._line = -1
1312 self._column = 0
1313 self._token_start = None
1314 self.token = ''
1315 self._lines = iter(lines)
1316 self._current_line = ''
1317 self._previous_line = 0
1318 self._previous_column = 0
1319 self._more_lines = True
1320 self._skip_comments = skip_comments
1321 self._whitespace_pattern = (skip_comments and self._WHITESPACE_OR_COMMENT
1322 or self._WHITESPACE)
1323 self.contains_silent_marker_before_current_token = False
1324
1325 self._SkipWhitespace()
1326 self.NextToken()
1327
1328 def LookingAt(self, token):
1329 return self.token == token
1330
1331 def AtEnd(self):
1332 """Checks the end of the text was reached.
1333
1334 Returns:
1335 True iff the end was reached.
1336 """
1337 return not self.token
1338
1339 def _PopLine(self):
1340 while len(self._current_line) <= self._column:
1341 try:
1342 self._current_line = next(self._lines)
1343 except StopIteration:
1344 self._current_line = ''
1345 self._more_lines = False
1346 return
1347 else:
1348 self._line += 1
1349 self._column = 0
1350
1351 def _SkipWhitespace(self):
1352 while True:
1353 self._PopLine()
1354 match = self._whitespace_pattern.match(self._current_line, self._column)
1355 if not match:
1356 break
1357 self.contains_silent_marker_before_current_token = match.group(0) == (
1358 ' ' + _DEBUG_STRING_SILENT_MARKER)
1359 length = len(match.group(0))
1360 self._column += length
1361
1362 def TryConsume(self, token):
1363 """Tries to consume a given piece of text.
1364
1365 Args:
1366 token: Text to consume.
1367
1368 Returns:
1369 True iff the text was consumed.
1370 """
1371 if self.token == token:
1372 self.NextToken()
1373 return True
1374 return False
1375
1376 def Consume(self, token):
1377 """Consumes a piece of text.
1378
1379 Args:
1380 token: Text to consume.
1381
1382 Raises:
1383 ParseError: If the text couldn't be consumed.
1384 """
1385 if not self.TryConsume(token):
1386 raise self.ParseError('Expected "%s".' % token)
1387
1388 def ConsumeComment(self):
1389 result = self.token
1390 if not self._COMMENT.match(result):
1391 raise self.ParseError('Expected comment.')
1392 self.NextToken()
1393 return result
1394
1395 def ConsumeCommentOrTrailingComment(self):
1396 """Consumes a comment, returns a 2-tuple (trailing bool, comment str)."""
1397
1398 # Tokenizer initializes _previous_line and _previous_column to 0. As the
1399 # tokenizer starts, it looks like there is a previous token on the line.
1400 just_started = self._line == 0 and self._column == 0
1401
1402 before_parsing = self._previous_line
1403 comment = self.ConsumeComment()
1404
    # A trailing comment is a comment on the same line as the previous token.
1406 trailing = (self._previous_line == before_parsing
1407 and not just_started)
1408
1409 return trailing, comment
1410
1411 def TryConsumeIdentifier(self):
1412 try:
1413 self.ConsumeIdentifier()
1414 return True
1415 except ParseError:
1416 return False
1417
1418 def ConsumeIdentifier(self):
1419 """Consumes protocol message field identifier.
1420
1421 Returns:
1422 Identifier string.
1423
1424 Raises:
1425 ParseError: If an identifier couldn't be consumed.
1426 """
1427 result = self.token
1428 if not self._IDENTIFIER.match(result):
1429 raise self.ParseError('Expected identifier.')
1430 self.NextToken()
1431 return result
1432
1433 def TryConsumeIdentifierOrNumber(self):
1434 try:
1435 self.ConsumeIdentifierOrNumber()
1436 return True
1437 except ParseError:
1438 return False
1439
1440 def ConsumeIdentifierOrNumber(self):
1441 """Consumes protocol message field identifier.
1442
1443 Returns:
1444 Identifier string.
1445
1446 Raises:
1447 ParseError: If an identifier couldn't be consumed.
1448 """
1449 result = self.token
1450 if not self._IDENTIFIER_OR_NUMBER.match(result):
1451 raise self.ParseError('Expected identifier or number, got %s.' % result)
1452 self.NextToken()
1453 return result
1454
1455 def TryConsumeInteger(self):
1456 try:
1457 self.ConsumeInteger()
1458 return True
1459 except ParseError:
1460 return False
1461
1462 def ConsumeInteger(self):
1463 """Consumes an integer number.
1464
1465 Returns:
1466 The integer parsed.
1467
1468 Raises:
1469 ParseError: If an integer couldn't be consumed.
1470 """
1471 try:
1472 result = _ParseAbstractInteger(self.token)
1473 except ValueError as e:
1474 raise self.ParseError(str(e))
1475 self.NextToken()
1476 return result
1477
1478 def TryConsumeFloat(self):
1479 try:
1480 self.ConsumeFloat()
1481 return True
1482 except ParseError:
1483 return False
1484
1485 def ConsumeFloat(self):
1486 """Consumes an floating point number.
1487
1488 Returns:
1489 The number parsed.
1490
1491 Raises:
1492 ParseError: If a floating point number couldn't be consumed.
1493 """
1494 try:
1495 result = ParseFloat(self.token)
1496 except ValueError as e:
1497 raise self.ParseError(str(e))
1498 self.NextToken()
1499 return result
1500
1501 def ConsumeBool(self):
1502 """Consumes a boolean value.
1503
1504 Returns:
1505 The bool parsed.
1506
1507 Raises:
1508 ParseError: If a boolean value couldn't be consumed.
1509 """
1510 try:
1511 result = ParseBool(self.token)
1512 except ValueError as e:
1513 raise self.ParseError(str(e))
1514 self.NextToken()
1515 return result
1516
1517 def TryConsumeByteString(self):
1518 try:
1519 self.ConsumeByteString()
1520 return True
1521 except ParseError:
1522 return False
1523
1524 def ConsumeString(self):
1525 """Consumes a string value.
1526
1527 Returns:
1528 The string parsed.
1529
1530 Raises:
1531 ParseError: If a string value couldn't be consumed.
1532 """
1533 the_bytes = self.ConsumeByteString()
1534 try:
1535 return str(the_bytes, 'utf-8')
1536 except UnicodeDecodeError as e:
1537 raise self._StringParseError(e)
1538
1539 def ConsumeByteString(self):
1540 """Consumes a byte array value.
1541
1542 Returns:
      The byte array parsed (as a bytes object).
1544
1545 Raises:
1546 ParseError: If a byte array value couldn't be consumed.
1547 """
1548 the_list = [self._ConsumeSingleByteString()]
1549 while self.token and self.token[0] in _QUOTES:
1550 the_list.append(self._ConsumeSingleByteString())
1551 return b''.join(the_list)
1552
1553 def _ConsumeSingleByteString(self):
1554 """Consume one token of a string literal.
1555
1556 String literals (whether bytes or text) can come in multiple adjacent
1557 tokens which are automatically concatenated, like in C or Python. This
1558 method only consumes one token.
1559
1560 Returns:
1561 The token parsed.
1562 Raises:
1563 ParseError: When the wrong format data is found.
1564 """
1565 text = self.token
1566 if len(text) < 1 or text[0] not in _QUOTES:
1567 raise self.ParseError('Expected string but found: %r' % (text,))
1568
1569 if len(text) < 2 or text[-1] != text[0]:
1570 raise self.ParseError('String missing ending quote: %r' % (text,))
1571
1572 try:
1573 result = text_encoding.CUnescape(text[1:-1])
1574 except ValueError as e:
1575 raise self.ParseError(str(e))
1576 self.NextToken()
1577 return result
1578
1579 def ConsumeEnum(self, field):
1580 try:
1581 result = ParseEnum(field, self.token)
1582 except ValueError as e:
1583 raise self.ParseError(str(e))
1584 self.NextToken()
1585 return result
1586
1587 def ParseErrorPreviousToken(self, message):
1588 """Creates and *returns* a ParseError for the previously read token.
1589
1590 Args:
1591 message: A message to set for the exception.
1592
1593 Returns:
1594 A ParseError instance.
1595 """
1596 return ParseError(message, self._previous_line + 1,
1597 self._previous_column + 1)
1598
1599 def ParseError(self, message):
1600 """Creates and *returns* a ParseError for the current token."""
1601 return ParseError('\'' + self._current_line + '\': ' + message,
1602 self._line + 1, self._column + 1)
1603
1604 def _StringParseError(self, e):
1605 return self.ParseError('Couldn\'t parse string: ' + str(e))
1606
1607 def NextToken(self):
1608 """Reads the next meaningful token."""
1609 self._previous_line = self._line
1610 self._previous_column = self._column
1611 self.contains_silent_marker_before_current_token = False
1612
1613 self._column += len(self.token)
1614 self._SkipWhitespace()
1615
1616 if not self._more_lines:
1617 self.token = ''
1618 return
1619
1620 match = self._TOKEN.match(self._current_line, self._column)
1621 if not match and not self._skip_comments:
1622 match = self._COMMENT.match(self._current_line, self._column)
1623 if match:
1624 token = match.group(0)
1625 self.token = token
1626 else:
1627 self.token = self._current_line[self._column]
1628
1629# Aliased so it can still be accessed by current visibility violators.
1630# TODO: Migrate violators to textformat_tokenizer.
1631_Tokenizer = Tokenizer # pylint: disable=invalid-name
1632
1633
1634def _ConsumeInt32(tokenizer):
1635 """Consumes a signed 32bit integer number from tokenizer.
1636
1637 Args:
1638 tokenizer: A tokenizer used to parse the number.
1639
1640 Returns:
1641 The integer parsed.
1642
1643 Raises:
1644 ParseError: If a signed 32bit integer couldn't be consumed.
1645 """
1646 return _ConsumeInteger(tokenizer, is_signed=True, is_long=False)
1647
1648
1649def _ConsumeUint32(tokenizer):
1650 """Consumes an unsigned 32bit integer number from tokenizer.
1651
1652 Args:
1653 tokenizer: A tokenizer used to parse the number.
1654
1655 Returns:
1656 The integer parsed.
1657
1658 Raises:
1659 ParseError: If an unsigned 32bit integer couldn't be consumed.
1660 """
1661 return _ConsumeInteger(tokenizer, is_signed=False, is_long=False)
1662
1663
1664def _TryConsumeInt64(tokenizer):
1665 try:
1666 _ConsumeInt64(tokenizer)
1667 return True
1668 except ParseError:
1669 return False
1670
1671
1672def _ConsumeInt64(tokenizer):
1673 """Consumes a signed 32bit integer number from tokenizer.
1674
1675 Args:
1676 tokenizer: A tokenizer used to parse the number.
1677
1678 Returns:
1679 The integer parsed.
1680
1681 Raises:
    ParseError: If a signed 64bit integer couldn't be consumed.
1683 """
1684 return _ConsumeInteger(tokenizer, is_signed=True, is_long=True)
1685
1686
1687def _TryConsumeUint64(tokenizer):
1688 try:
1689 _ConsumeUint64(tokenizer)
1690 return True
1691 except ParseError:
1692 return False
1693
1694
1695def _ConsumeUint64(tokenizer):
1696 """Consumes an unsigned 64bit integer number from tokenizer.
1697
1698 Args:
1699 tokenizer: A tokenizer used to parse the number.
1700
1701 Returns:
1702 The integer parsed.
1703
1704 Raises:
1705 ParseError: If an unsigned 64bit integer couldn't be consumed.
1706 """
1707 return _ConsumeInteger(tokenizer, is_signed=False, is_long=True)
1708
1709
1710def _ConsumeInteger(tokenizer, is_signed=False, is_long=False):
1711 """Consumes an integer number from tokenizer.
1712
1713 Args:
1714 tokenizer: A tokenizer used to parse the number.
1715 is_signed: True if a signed integer must be parsed.
1716 is_long: True if a long integer must be parsed.
1717
1718 Returns:
1719 The integer parsed.
1720
1721 Raises:
1722 ParseError: If an integer with given characteristics couldn't be consumed.
1723 """
1724 try:
1725 result = ParseInteger(tokenizer.token, is_signed=is_signed, is_long=is_long)
1726 except ValueError as e:
1727 raise tokenizer.ParseError(str(e))
1728 tokenizer.NextToken()
1729 return result
1730
1731
1732def ParseInteger(text, is_signed=False, is_long=False):
1733 """Parses an integer.
1734
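  Hexadecimal and C-style octal literals are accepted in addition to decimal,
  for example::

    ParseInteger('0x1f')                # 31
    ParseInteger('010')                 # 8
    ParseInteger('-1', is_signed=True)  # -1
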
1735 Args:
1736 text: The text to parse.
1737 is_signed: True if a signed integer must be parsed.
1738 is_long: True if a long integer must be parsed.
1739
1740 Returns:
1741 The integer value.
1742
1743 Raises:
    ValueError: If the text is not a valid integer.
1745 """
1746 # Do the actual parsing. Exception handling is propagated to caller.
1747 result = _ParseAbstractInteger(text)
1748
1749 # Check if the integer is sane. Exceptions handled by callers.
1750 checker = _INTEGER_CHECKERS[2 * int(is_long) + int(is_signed)]
1751 checker.CheckValue(result)
1752 return result
1753
1754
1755def _ParseAbstractInteger(text):
1756 """Parses an integer without checking size/signedness.
1757
1758 Args:
1759 text: The text to parse.
1760
1761 Returns:
1762 The integer value.
1763
1764 Raises:
    ValueError: If the text is not a valid integer.
1766 """
1767 # Do the actual parsing. Exception handling is propagated to caller.
1768 orig_text = text
1769 c_octal_match = re.match(r'(-?)0(\d+)$', text)
1770 if c_octal_match:
1771 # Python 3 no longer supports 0755 octal syntax without the 'o', so
1772 # we always use the '0o' prefix for multi-digit numbers starting with 0.
1773 text = c_octal_match.group(1) + '0o' + c_octal_match.group(2)
1774 try:
1775 return int(text, 0)
1776 except ValueError:
1777 raise ValueError('Couldn\'t parse integer: %s' % orig_text)
1778
1779
1780def ParseFloat(text):
1781 """Parse a floating point number.
1782
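  Besides Python-style literals, the C-style spellings used by the text format
  are accepted, for example::

    ParseFloat('1.5f')   # 1.5
    ParseFloat('-inf')   # float('-inf')
    ParseFloat('nan')    # float('nan')
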
1783 Args:
1784 text: Text to parse.
1785
1786 Returns:
1787 The number parsed.
1788
1789 Raises:
1790 ValueError: If a floating point number couldn't be parsed.
1791 """
1792 try:
1793 # Assume Python compatible syntax.
1794 return float(text)
1795 except ValueError:
1796 # Check alternative spellings.
1797 if _FLOAT_INFINITY.match(text):
1798 if text[0] == '-':
1799 return float('-inf')
1800 else:
1801 return float('inf')
1802 elif _FLOAT_NAN.match(text):
1803 return float('nan')
1804 else:
1805 # assume '1.0f' format
1806 try:
1807 return float(text.rstrip('f'))
1808 except ValueError:
1809 raise ValueError('Couldn\'t parse float: %s' % text)
1810
1811
1812def ParseBool(text):
1813 """Parse a boolean value.
1814
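  Accepted spellings include, for example::

    ParseBool('true')   # True
    ParseBool('1')      # True
    ParseBool('False')  # False
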
1815 Args:
1816 text: Text to parse.
1817
1818 Returns:
    The boolean value parsed.
1820
1821 Raises:
1822 ValueError: If text is not a valid boolean.
1823 """
1824 if text in ('true', 't', '1', 'True'):
1825 return True
1826 elif text in ('false', 'f', '0', 'False'):
1827 return False
1828 else:
1829 raise ValueError('Expected "true" or "false".')
1830
1831
1832def ParseEnum(field, value):
1833 """Parse an enum value.
1834
1835 The value can be specified by a number (the enum value), or by
1836 a string literal (the enum name).
1837
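  For example (a sketch; ``color`` is a hypothetical enum field of ``MyProto``)::

    field = MyProto.DESCRIPTOR.fields_by_name['color']
    ParseEnum(field, 'RED')  # the number of the enum value named RED
    ParseEnum(field, '2')    # 2, if that number is valid for the enum type
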
1838 Args:
1839 field: Enum field descriptor.
1840 value: String value.
1841
1842 Returns:
1843 Enum value number.
1844
1845 Raises:
1846 ValueError: If the enum value could not be parsed.
1847 """
1848 enum_descriptor = field.enum_type
1849 try:
1850 number = int(value, 0)
1851 except ValueError:
1852 # Identifier.
1853 enum_value = enum_descriptor.values_by_name.get(value, None)
1854 if enum_value is None:
1855 raise ValueError('Enum type "%s" has no value named %s.' %
1856 (enum_descriptor.full_name, value))
1857 else:
1858 if not field.enum_type.is_closed:
1859 return number
1860 enum_value = enum_descriptor.values_by_number.get(number, None)
1861 if enum_value is None:
1862 raise ValueError('Enum type "%s" has no value with number %d.' %
1863 (enum_descriptor.full_name, number))
1864 return enum_value.number