1# Protocol Buffers - Google's data interchange format
2# Copyright 2008 Google Inc. All rights reserved.
3#
4# Use of this source code is governed by a BSD-style
5# license that can be found in the LICENSE file or at
6# https://developers.google.com/open-source/licenses/bsd
7
8"""Contains routines for printing protocol messages in text format.
9
10Simple usage example::
11
12 # Create a proto object and serialize it to a text proto string.
13 message = my_proto_pb2.MyMessage(foo='bar')
14 text_proto = text_format.MessageToString(message)
15
16 # Parse a text proto string.
17 message = text_format.Parse(text_proto, my_proto_pb2.MyMessage())
18"""
19
20__author__ = 'kenton@google.com (Kenton Varda)'
21
22# TODO Import thread contention leads to test failures.
23import encodings.raw_unicode_escape # pylint: disable=unused-import
24import encodings.unicode_escape # pylint: disable=unused-import
25import io
26import math
27import re
28import warnings
29
30from google.protobuf.internal import decoder
31from google.protobuf.internal import type_checkers
32from google.protobuf import descriptor
33from google.protobuf import text_encoding
34from google.protobuf import unknown_fields
35
36# pylint: disable=g-import-not-at-top
37__all__ = ['MessageToString', 'Parse', 'PrintMessage', 'PrintField',
38 'PrintFieldValue', 'Merge', 'MessageToBytes']
39
40_INTEGER_CHECKERS = (type_checkers.Uint32ValueChecker(),
41 type_checkers.Int32ValueChecker(),
42 type_checkers.Uint64ValueChecker(),
43 type_checkers.Int64ValueChecker())
44_FLOAT_INFINITY = re.compile('-?inf(?:inity)?f?$', re.IGNORECASE)
45_FLOAT_NAN = re.compile('nanf?$', re.IGNORECASE)
46_FLOAT_OCTAL_PREFIX = re.compile('-?0[0-9]+')
47_QUOTES = frozenset(("'", '"'))
48_ANY_FULL_TYPE_NAME = 'google.protobuf.Any'
49_DEBUG_STRING_SILENT_MARKER = '\t '
50
51_as_utf8_default = True
52
53
54class Error(Exception):
55 """Top-level module error for text_format."""
56
57
58class ParseError(Error):
59 """Thrown in case of text parsing or tokenizing error."""
60
61 def __init__(self, message=None, line=None, column=None):
62 if message is not None and line is not None:
63 loc = str(line)
64 if column is not None:
65 loc += ':{0}'.format(column)
66 message = '{0} : {1}'.format(loc, message)
67 if message is not None:
68 super(ParseError, self).__init__(message)
69 else:
70 super(ParseError, self).__init__()
71 self._line = line
72 self._column = column
73
74 def GetLine(self):
75 return self._line
76
77 def GetColumn(self):
78 return self._column
79
80
81class TextWriter(object):
82
83 def __init__(self, as_utf8):
84 self._writer = io.StringIO()
85
86 def write(self, val):
87 return self._writer.write(val)
88
89 def close(self):
90 return self._writer.close()
91
92 def getvalue(self):
93 return self._writer.getvalue()
94
95
96def MessageToString(
97 message,
98 as_utf8=_as_utf8_default,
99 as_one_line=False,
100 use_short_repeated_primitives=False,
101 pointy_brackets=False,
102 use_index_order=False,
103 float_format=None,
104 double_format=None,
105 use_field_number=False,
106 descriptor_pool=None,
107 indent=0,
108 message_formatter=None,
109 print_unknown_fields=False,
110 force_colon=False) -> str:
111 """Convert protobuf message to text format.
112
113 Double values can be formatted compactly with 15 digits of
114 precision (which is the most that IEEE 754 "double" can guarantee)
115 using double_format='.15g'. To ensure that converting to text and back to a
116 proto will result in an identical value, double_format='.17g' should be used.
117
118 Args:
119 message: The protocol buffers message.
120 as_utf8: Return unescaped Unicode for non-ASCII characters.
121 as_one_line: Don't introduce newlines between fields.
122 use_short_repeated_primitives: Use short repeated format for primitives.
123 pointy_brackets: If True, use angle brackets instead of curly braces for
124 nesting.
    use_index_order: If True, fields of a proto message will be printed using
      the order defined in source code instead of the field number order.
      Extensions will be printed at the end of the message, and their relative
      order is determined by the extension number. By default, the field
      number order is used.
    float_format (str): Deprecated. If set, use this to specify float field
      formatting (per the "Format Specification Mini-Language"); otherwise,
      the shortest float that has the same value on the wire will be printed.
      This also affects double fields if double_format is not set but
      float_format is set.
    double_format (str): Deprecated. If set, use this to specify double field
      formatting (per the "Format Specification Mini-Language"); if it is not
      set but float_format is set, use float_format. Otherwise, use ``str()``.
137 use_field_number: If True, print field numbers instead of names.
138 descriptor_pool (DescriptorPool): Descriptor pool used to resolve Any types.
139 indent (int): The initial indent level, in terms of spaces, for pretty
140 print.
141 message_formatter (function(message, indent, as_one_line) -> unicode|None):
142 Custom formatter for selected sub-messages (usually based on message
143 type). Use to pretty print parts of the protobuf for easier diffing.
144 print_unknown_fields: If True, unknown fields will be printed.
145 force_colon: If set, a colon will be added after the field name even if the
146 field is a proto message.
147
148 Returns:
149 str: A string of the text formatted protocol buffer message.
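
  Example (illustrative; reuses the hypothetical ``my_proto_pb2.MyMessage``
  from the module docstring)::

    message = my_proto_pb2.MyMessage(foo='bar')
    # Single-line output; '.17g' keeps doubles round-trippable.
    text_proto = text_format.MessageToString(
        message, as_one_line=True, double_format='.17g')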
150 """
151 out = TextWriter(as_utf8)
152 printer = _Printer(
153 out,
154 indent,
155 as_utf8,
156 as_one_line,
157 use_short_repeated_primitives,
158 pointy_brackets,
159 use_index_order,
160 float_format,
161 double_format,
162 use_field_number,
163 descriptor_pool,
164 message_formatter,
165 print_unknown_fields=print_unknown_fields,
166 force_colon=force_colon)
167 printer.PrintMessage(message)
168 result = out.getvalue()
169 out.close()
170 if as_one_line:
171 return result.rstrip()
172 return result
173
174
175def MessageToBytes(message, **kwargs) -> bytes:
176 """Convert protobuf message to encoded text format. See MessageToString."""
177 text = MessageToString(message, **kwargs)
178 if isinstance(text, bytes):
179 return text
180 codec = 'utf-8' if kwargs.get('as_utf8') else 'ascii'
181 return text.encode(codec)
182
183
184def _IsMapEntry(field):
185 return (field.type == descriptor.FieldDescriptor.TYPE_MESSAGE and
186 field.message_type.has_options and
187 field.message_type.GetOptions().map_entry)
188
189
190def _IsGroupLike(field):
191 """Determines if a field is consistent with a proto2 group.
192
193 Args:
194 field: The field descriptor.
195
196 Returns:
197 True if this field is group-like, false otherwise.
198 """
199 # Groups are always tag-delimited.
200 if field.type != descriptor.FieldDescriptor.TYPE_GROUP:
201 return False
202
  # Group field names are always the lowercase type name.
204 if field.name != field.message_type.name.lower():
205 return False
206
207 if field.message_type.file != field.file:
208 return False
209
210 # Group messages are always defined in the same scope as the field. File
211 # level extensions will compare NULL == NULL here, which is why the file
212 # comparison above is necessary to ensure both come from the same file.
213 return (
214 field.message_type.containing_type == field.extension_scope
215 if field.is_extension
216 else field.message_type.containing_type == field.containing_type
217 )
218
219
220def PrintMessage(message,
221 out,
222 indent=0,
223 as_utf8=_as_utf8_default,
224 as_one_line=False,
225 use_short_repeated_primitives=False,
226 pointy_brackets=False,
227 use_index_order=False,
228 float_format=None,
229 double_format=None,
230 use_field_number=False,
231 descriptor_pool=None,
232 message_formatter=None,
233 print_unknown_fields=False,
234 force_colon=False):
235 """Convert the message to text format and write it to the out stream.
236
237 Args:
238 message: The Message object to convert to text format.
239 out: A file handle to write the message to.
240 indent: The initial indent level for pretty print.
241 as_utf8: Return unescaped Unicode for non-ASCII characters.
242 as_one_line: Don't introduce newlines between fields.
243 use_short_repeated_primitives: Use short repeated format for primitives.
244 pointy_brackets: If True, use angle brackets instead of curly braces for
245 nesting.
246 use_index_order: If True, print fields of a proto message using the order
247 defined in source code instead of the field number. By default, use the
248 field number order.
    float_format: If set, use this to specify float field formatting
      (per the "Format Specification Mini-Language"); otherwise, the shortest
      float that has the same value on the wire will be printed. This also
      affects double fields if double_format is not set but float_format is
      set.
    double_format: If set, use this to specify double field formatting
      (per the "Format Specification Mini-Language"); if it is not set but
      float_format is set, use float_format. Otherwise, str() is used.
256 use_field_number: If True, print field numbers instead of names.
257 descriptor_pool: A DescriptorPool used to resolve Any types.
258 message_formatter: A function(message, indent, as_one_line): unicode|None
259 to custom format selected sub-messages (usually based on message type).
260 Use to pretty print parts of the protobuf for easier diffing.
261 print_unknown_fields: If True, unknown fields will be printed.
262 force_colon: If set, a colon will be added after the field name even if
263 the field is a proto message.
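
  Example (illustrative; assumes the hypothetical ``my_proto_pb2.MyMessage``
  from the module docstring)::

    import io

    out = io.StringIO()
    text_format.PrintMessage(my_proto_pb2.MyMessage(foo='bar'), out, indent=2)
    text_proto = out.getvalue()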
264 """
265 printer = _Printer(
266 out=out, indent=indent, as_utf8=as_utf8,
267 as_one_line=as_one_line,
268 use_short_repeated_primitives=use_short_repeated_primitives,
269 pointy_brackets=pointy_brackets,
270 use_index_order=use_index_order,
271 float_format=float_format,
272 double_format=double_format,
273 use_field_number=use_field_number,
274 descriptor_pool=descriptor_pool,
275 message_formatter=message_formatter,
276 print_unknown_fields=print_unknown_fields,
277 force_colon=force_colon)
278 printer.PrintMessage(message)
279
280
281def PrintField(field,
282 value,
283 out,
284 indent=0,
285 as_utf8=_as_utf8_default,
286 as_one_line=False,
287 use_short_repeated_primitives=False,
288 pointy_brackets=False,
289 use_index_order=False,
290 float_format=None,
291 double_format=None,
292 message_formatter=None,
293 print_unknown_fields=False,
294 force_colon=False):
295 """Print a single field name/value pair."""
296 printer = _Printer(out, indent, as_utf8, as_one_line,
297 use_short_repeated_primitives, pointy_brackets,
298 use_index_order, float_format, double_format,
299 message_formatter=message_formatter,
300 print_unknown_fields=print_unknown_fields,
301 force_colon=force_colon)
302 printer.PrintField(field, value)
303
304
305def PrintFieldValue(field,
306 value,
307 out,
308 indent=0,
309 as_utf8=_as_utf8_default,
310 as_one_line=False,
311 use_short_repeated_primitives=False,
312 pointy_brackets=False,
313 use_index_order=False,
314 float_format=None,
315 double_format=None,
316 message_formatter=None,
317 print_unknown_fields=False,
318 force_colon=False):
319 """Print a single field value (not including name)."""
320 printer = _Printer(out, indent, as_utf8, as_one_line,
321 use_short_repeated_primitives, pointy_brackets,
322 use_index_order, float_format, double_format,
323 message_formatter=message_formatter,
324 print_unknown_fields=print_unknown_fields,
325 force_colon=force_colon)
326 printer.PrintFieldValue(field, value)
327
328
329def _BuildMessageFromTypeName(type_name, descriptor_pool):
330 """Returns a protobuf message instance.
331
332 Args:
333 type_name: Fully-qualified protobuf message type name string.
334 descriptor_pool: DescriptorPool instance.
335
336 Returns:
    A Message instance of type matching type_name, or None if a Descriptor
    matching type_name wasn't found.
339 """
340 # pylint: disable=g-import-not-at-top
341 if descriptor_pool is None:
342 from google.protobuf import descriptor_pool as pool_mod
343 descriptor_pool = pool_mod.Default()
344 from google.protobuf import message_factory
345 try:
346 message_descriptor = descriptor_pool.FindMessageTypeByName(type_name)
347 except KeyError:
348 return None
349 message_type = message_factory.GetMessageClass(message_descriptor)
350 return message_type()
351
352
353# These values must match WireType enum in //google/protobuf/wire_format.h.
354WIRETYPE_LENGTH_DELIMITED = 2
355WIRETYPE_START_GROUP = 3
356
357
358class _Printer(object):
359 """Text format printer for protocol message."""
360
361 def __init__(
362 self,
363 out,
364 indent=0,
365 as_utf8=_as_utf8_default,
366 as_one_line=False,
367 use_short_repeated_primitives=False,
368 pointy_brackets=False,
369 use_index_order=False,
370 float_format=None,
371 double_format=None,
372 use_field_number=False,
373 descriptor_pool=None,
374 message_formatter=None,
375 print_unknown_fields=False,
376 force_colon=False):
377 """Initialize the Printer.
378
379 Double values can be formatted compactly with 15 digits of precision
380 (which is the most that IEEE 754 "double" can guarantee) using
381 double_format='.15g'. To ensure that converting to text and back to a proto
382 will result in an identical value, double_format='.17g' should be used.
383
384 Args:
385 out: To record the text format result.
386 indent: The initial indent level for pretty print.
387 as_utf8: Return unescaped Unicode for non-ASCII characters.
388 as_one_line: Don't introduce newlines between fields.
389 use_short_repeated_primitives: Use short repeated format for primitives.
390 pointy_brackets: If True, use angle brackets instead of curly braces for
391 nesting.
392 use_index_order: If True, print fields of a proto message using the order
393 defined in source code instead of the field number. By default, use the
394 field number order.
      float_format: Deprecated. If set, use this to specify float field
        formatting (per the "Format Specification Mini-Language"); otherwise,
        the shortest float that has the same value on the wire will be
        printed. This also affects double fields if double_format is not set
        but float_format is set.
      double_format: Deprecated. If set, use this to specify double field
        formatting (per the "Format Specification Mini-Language"); if it is
        not set but float_format is set, use float_format. Otherwise, str()
        is used.
402 use_field_number: If True, print field numbers instead of names.
403 descriptor_pool: A DescriptorPool used to resolve Any types.
404 message_formatter: A function(message, indent, as_one_line): unicode|None
405 to custom format selected sub-messages (usually based on message type).
406 Use to pretty print parts of the protobuf for easier diffing.
407 print_unknown_fields: If True, unknown fields will be printed.
408 force_colon: If set, a colon will be added after the field name even if
409 the field is a proto message.
410 """
411 self.out = out
412 self.indent = indent
413 self.as_utf8 = as_utf8
414 self.as_one_line = as_one_line
415 self.use_short_repeated_primitives = use_short_repeated_primitives
416 self.pointy_brackets = pointy_brackets
417 self.use_index_order = use_index_order
418 self.float_format = float_format
419 if double_format is not None:
420 warnings.warn(
421 'double_format is deprecated for text_format. This will '
422 'turn into error in 7.34.0, please remove it before that.'
423 )
424 self.double_format = double_format
425 else:
426 self.double_format = float_format
427 self.use_field_number = use_field_number
428 self.descriptor_pool = descriptor_pool
429 self.message_formatter = message_formatter
430 self.print_unknown_fields = print_unknown_fields
431 self.force_colon = force_colon
432
433 def _TryPrintAsAnyMessage(self, message):
434 """Serializes if message is a google.protobuf.Any field."""
435 if '/' not in message.type_url:
436 return False
437 packed_message = _BuildMessageFromTypeName(message.TypeName(),
438 self.descriptor_pool)
439 if packed_message is not None:
440 packed_message.MergeFromString(message.value)
441 colon = ':' if self.force_colon else ''
442 self.out.write('%s[%s]%s ' % (self.indent * ' ', message.type_url, colon))
443 self._PrintMessageFieldValue(packed_message)
444 self.out.write(' ' if self.as_one_line else '\n')
445 return True
446 else:
447 return False
448
449 def _TryCustomFormatMessage(self, message):
450 formatted = self.message_formatter(message, self.indent, self.as_one_line)
451 if formatted is None:
452 return False
453
454 out = self.out
455 out.write(' ' * self.indent)
456 out.write(formatted)
457 out.write(' ' if self.as_one_line else '\n')
458 return True
459
460 def PrintMessage(self, message):
461 """Convert protobuf message to text format.
462
463 Args:
464 message: The protocol buffers message.
465 """
466 if self.message_formatter and self._TryCustomFormatMessage(message):
467 return
468 if (message.DESCRIPTOR.full_name == _ANY_FULL_TYPE_NAME and
469 self._TryPrintAsAnyMessage(message)):
470 return
471 fields = message.ListFields()
472 if self.use_index_order:
473 fields.sort(
474 key=lambda x: x[0].number if x[0].is_extension else x[0].index)
475 for field, value in fields:
476 if _IsMapEntry(field):
477 for key in sorted(value):
478 # This is slow for maps with submessage entries because it copies the
479 # entire tree. Unfortunately this would take significant refactoring
480 # of this file to work around.
481 #
482 # TODO: refactor and optimize if this becomes an issue.
483 entry_submsg = value.GetEntryClass()(key=key, value=value[key])
484 self.PrintField(field, entry_submsg)
485 elif field.is_repeated:
486 if (self.use_short_repeated_primitives
487 and field.cpp_type != descriptor.FieldDescriptor.CPPTYPE_MESSAGE
488 and field.cpp_type != descriptor.FieldDescriptor.CPPTYPE_STRING):
489 self._PrintShortRepeatedPrimitivesValue(field, value)
490 else:
491 for element in value:
492 self.PrintField(field, element)
493 else:
494 self.PrintField(field, value)
495
496 if self.print_unknown_fields:
497 self._PrintUnknownFields(unknown_fields.UnknownFieldSet(message))
498
499 def _PrintUnknownFields(self, unknown_field_set):
500 """Print unknown fields."""
501 out = self.out
502 for field in unknown_field_set:
503 out.write(' ' * self.indent)
504 out.write(str(field.field_number))
505 if field.wire_type == WIRETYPE_START_GROUP:
506 if self.as_one_line:
507 out.write(' { ')
508 else:
509 out.write(' {\n')
510 self.indent += 2
511
512 self._PrintUnknownFields(field.data)
513
514 if self.as_one_line:
515 out.write('} ')
516 else:
517 self.indent -= 2
518 out.write(' ' * self.indent + '}\n')
519 elif field.wire_type == WIRETYPE_LENGTH_DELIMITED:
520 try:
521 # If this field is parseable as a Message, it is probably
522 # an embedded message.
523 # pylint: disable=protected-access
524 (embedded_unknown_message, pos) = decoder._DecodeUnknownFieldSet(
525 memoryview(field.data), 0, len(field.data))
526 except Exception: # pylint: disable=broad-except
527 pos = 0
528
529 if pos == len(field.data):
530 if self.as_one_line:
531 out.write(' { ')
532 else:
533 out.write(' {\n')
534 self.indent += 2
535
536 self._PrintUnknownFields(embedded_unknown_message)
537
538 if self.as_one_line:
539 out.write('} ')
540 else:
541 self.indent -= 2
542 out.write(' ' * self.indent + '}\n')
543 else:
544 # A string or bytes field. self.as_utf8 may not work.
545 out.write(': \"')
546 out.write(text_encoding.CEscape(field.data, False))
547 out.write('\" ' if self.as_one_line else '\"\n')
548 else:
549 # varint, fixed32, fixed64
550 out.write(': ')
551 out.write(str(field.data))
552 out.write(' ' if self.as_one_line else '\n')
553
554 def _PrintFieldName(self, field):
555 """Print field name."""
556 out = self.out
557 out.write(' ' * self.indent)
558 if self.use_field_number:
559 out.write(str(field.number))
560 else:
561 if field.is_extension:
562 out.write('[')
563 if (field.containing_type.GetOptions().message_set_wire_format and
564 field.type == descriptor.FieldDescriptor.TYPE_MESSAGE and
565 not field.is_required and
566 not field.is_repeated):
567 out.write(field.message_type.full_name)
568 else:
569 out.write(field.full_name)
570 out.write(']')
571 elif _IsGroupLike(field):
572 # For groups, use the capitalized name.
573 out.write(field.message_type.name)
574 else:
575 out.write(field.name)
576
577 if (self.force_colon or
578 field.cpp_type != descriptor.FieldDescriptor.CPPTYPE_MESSAGE):
579 # The colon is optional in this case, but our cross-language golden files
580 # don't include it. Here, the colon is only included if force_colon is
      # set to True.
582 out.write(':')
583
584 def PrintField(self, field, value):
585 """Print a single field name/value pair."""
586 self._PrintFieldName(field)
587 self.out.write(' ')
588 self.PrintFieldValue(field, value)
589 self.out.write(' ' if self.as_one_line else '\n')
590
591 def _PrintShortRepeatedPrimitivesValue(self, field, value):
    """Prints short repeated primitives value."""
593 # Note: this is called only when value has at least one element.
594 self._PrintFieldName(field)
595 self.out.write(' [')
596 for i in range(len(value) - 1):
597 self.PrintFieldValue(field, value[i])
598 self.out.write(', ')
599 self.PrintFieldValue(field, value[-1])
600 self.out.write(']')
601 self.out.write(' ' if self.as_one_line else '\n')
602
603 def _PrintMessageFieldValue(self, value):
604 if self.pointy_brackets:
605 openb = '<'
606 closeb = '>'
607 else:
608 openb = '{'
609 closeb = '}'
610
611 if self.as_one_line:
612 self.out.write('%s ' % openb)
613 self.PrintMessage(value)
614 self.out.write(closeb)
615 else:
616 self.out.write('%s\n' % openb)
617 self.indent += 2
618 self.PrintMessage(value)
619 self.indent -= 2
620 self.out.write(' ' * self.indent + closeb)
621
622 def PrintFieldValue(self, field, value):
623 """Print a single field value (not including name).
624
625 For repeated fields, the value should be a single element.
626
627 Args:
628 field: The descriptor of the field to be printed.
629 value: The value of the field.
630 """
631 out = self.out
632 if field.cpp_type == descriptor.FieldDescriptor.CPPTYPE_MESSAGE:
633 self._PrintMessageFieldValue(value)
634 elif field.cpp_type == descriptor.FieldDescriptor.CPPTYPE_ENUM:
635 enum_value = field.enum_type.values_by_number.get(value, None)
636 if enum_value is not None:
637 out.write(enum_value.name)
638 else:
639 out.write(str(value))
640 elif field.cpp_type == descriptor.FieldDescriptor.CPPTYPE_STRING:
641 out.write('\"')
642 if isinstance(value, str) and not self.as_utf8:
643 out_value = value.encode('utf-8')
644 else:
645 out_value = value
646 if field.type == descriptor.FieldDescriptor.TYPE_BYTES:
647 # We always need to escape all binary data in TYPE_BYTES fields.
648 out_as_utf8 = False
649 else:
650 out_as_utf8 = self.as_utf8
651 out.write(text_encoding.CEscape(out_value, out_as_utf8))
652 out.write('\"')
653 elif field.cpp_type == descriptor.FieldDescriptor.CPPTYPE_BOOL:
654 if value:
655 out.write('true')
656 else:
657 out.write('false')
658 elif field.cpp_type == descriptor.FieldDescriptor.CPPTYPE_FLOAT:
659 if self.float_format is not None:
660 warnings.warn(
661 'float_format is deprecated for text_format. This '
662 'will turn into error in 7.34.0, please remove it '
663 'before that.'
664 )
665 out.write('{1:{0}}'.format(self.float_format, value))
666 else:
667 if math.isnan(value):
668 out.write(str(value))
669 else:
670 out.write(str(type_checkers.ToShortestFloat(value)))
671 elif (field.cpp_type == descriptor.FieldDescriptor.CPPTYPE_DOUBLE and
672 self.double_format is not None):
673 out.write('{1:{0}}'.format(self.double_format, value))
674 else:
675 out.write(str(value))
676
677
678def Parse(text,
679 message,
680 allow_unknown_extension=False,
681 allow_field_number=False,
682 descriptor_pool=None,
683 allow_unknown_field=False):
684 """Parses a text representation of a protocol message into a message.
685
686 NOTE: for historical reasons this function does not clear the input
687 message. This is different from what the binary msg.ParseFrom(...) does.
688 If text contains a field already set in message, the value is appended if the
689 field is repeated. Otherwise, an error is raised.
690
691 Example::
692
693 a = MyProto()
694 a.repeated_field.append('test')
695 b = MyProto()
696
697 # Repeated fields are combined
698 text_format.Parse(repr(a), b)
699 text_format.Parse(repr(a), b) # repeated_field contains ["test", "test"]
700
701 # Non-repeated fields cannot be overwritten
702 a.singular_field = 1
703 b.singular_field = 2
704 text_format.Parse(repr(a), b) # ParseError
705
706 # Binary version:
707 b.ParseFromString(a.SerializeToString()) # repeated_field is now "test"
708
709 Caller is responsible for clearing the message as needed.
710
711 Args:
712 text (str): Message text representation.
713 message (Message): A protocol buffer message to merge into.
714 allow_unknown_extension: if True, skip over missing extensions and keep
715 parsing
716 allow_field_number: if True, both field number and field name are allowed.
717 descriptor_pool (DescriptorPool): Descriptor pool used to resolve Any types.
    allow_unknown_field: if True, skip over unknown fields and keep parsing.
      Avoid using this option if possible, as it may hide some errors (e.g.
      a spelling error in a field name).
721
722 Returns:
723 Message: The same message passed as argument.
724
725 Raises:
726 ParseError: On text parsing problems.
727 """
728 return ParseLines(text.split(b'\n' if isinstance(text, bytes) else u'\n'),
729 message,
730 allow_unknown_extension,
731 allow_field_number,
732 descriptor_pool=descriptor_pool,
733 allow_unknown_field=allow_unknown_field)
734
735
736def Merge(text,
737 message,
738 allow_unknown_extension=False,
739 allow_field_number=False,
740 descriptor_pool=None,
741 allow_unknown_field=False):
742 """Parses a text representation of a protocol message into a message.
743
744 Like Parse(), but allows repeated values for a non-repeated field, and uses
745 the last one. This means any non-repeated, top-level fields specified in text
746 replace those in the message.
747
748 Args:
749 text (str): Message text representation.
750 message (Message): A protocol buffer message to merge into.
751 allow_unknown_extension: if True, skip over missing extensions and keep
752 parsing
753 allow_field_number: if True, both field number and field name are allowed.
754 descriptor_pool (DescriptorPool): Descriptor pool used to resolve Any types.
    allow_unknown_field: if True, skip over unknown fields and keep parsing.
      Avoid using this option if possible, as it may hide some errors (e.g.
      a spelling error in a field name).
758
759 Returns:
760 Message: The same message passed as argument.
761
762 Raises:
763 ParseError: On text parsing problems.
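
  Example (illustrative; assumes the hypothetical ``MyProto`` from the Parse()
  docstring, with a scalar field ``singular_field``)::

    a = MyProto(singular_field=1)
    # Unlike Parse(), Merge() accepts a value for an already-set non-repeated
    # field and keeps the last one, so singular_field ends up as 2.
    text_format.Merge('singular_field: 2', a)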
764 """
765 return MergeLines(
766 text.split(b'\n' if isinstance(text, bytes) else u'\n'),
767 message,
768 allow_unknown_extension,
769 allow_field_number,
770 descriptor_pool=descriptor_pool,
771 allow_unknown_field=allow_unknown_field)
772
773
774def ParseLines(lines,
775 message,
776 allow_unknown_extension=False,
777 allow_field_number=False,
778 descriptor_pool=None,
779 allow_unknown_field=False):
780 """Parses a text representation of a protocol message into a message.
781
782 See Parse() for caveats.
783
784 Args:
785 lines: An iterable of lines of a message's text representation.
786 message: A protocol buffer message to merge into.
787 allow_unknown_extension: if True, skip over missing extensions and keep
788 parsing
789 allow_field_number: if True, both field number and field name are allowed.
790 descriptor_pool: A DescriptorPool used to resolve Any types.
    allow_unknown_field: if True, skip over unknown fields and keep parsing.
      Avoid using this option if possible, as it may hide some errors (e.g.
      a spelling error in a field name).
794
795 Returns:
796 The same message passed as argument.
797
798 Raises:
799 ParseError: On text parsing problems.
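
  Example (illustrative; assumes the hypothetical ``MyProto`` and a
  text-format file ``message.textproto``)::

    with open('message.textproto') as f:
      message = text_format.ParseLines(f, MyProto())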
800 """
801 parser = _Parser(allow_unknown_extension,
802 allow_field_number,
803 descriptor_pool=descriptor_pool,
804 allow_unknown_field=allow_unknown_field)
805 return parser.ParseLines(lines, message)
806
807
808def MergeLines(lines,
809 message,
810 allow_unknown_extension=False,
811 allow_field_number=False,
812 descriptor_pool=None,
813 allow_unknown_field=False):
814 """Parses a text representation of a protocol message into a message.
815
816 See Merge() for more details.
817
818 Args:
819 lines: An iterable of lines of a message's text representation.
820 message: A protocol buffer message to merge into.
821 allow_unknown_extension: if True, skip over missing extensions and keep
822 parsing
823 allow_field_number: if True, both field number and field name are allowed.
824 descriptor_pool: A DescriptorPool used to resolve Any types.
    allow_unknown_field: if True, skip over unknown fields and keep parsing.
      Avoid using this option if possible, as it may hide some errors (e.g.
      a spelling error in a field name).
828
829 Returns:
830 The same message passed as argument.
831
832 Raises:
833 ParseError: On text parsing problems.
834 """
835 parser = _Parser(allow_unknown_extension,
836 allow_field_number,
837 descriptor_pool=descriptor_pool,
838 allow_unknown_field=allow_unknown_field)
839 return parser.MergeLines(lines, message)
840
841
842class _Parser(object):
843 """Text format parser for protocol message."""
844
845 def __init__(self,
846 allow_unknown_extension=False,
847 allow_field_number=False,
848 descriptor_pool=None,
849 allow_unknown_field=False):
850 self.allow_unknown_extension = allow_unknown_extension
851 self.allow_field_number = allow_field_number
852 self.descriptor_pool = descriptor_pool
853 self.allow_unknown_field = allow_unknown_field
854
855 def ParseLines(self, lines, message):
856 """Parses a text representation of a protocol message into a message."""
857 self._allow_multiple_scalars = False
858 self._ParseOrMerge(lines, message)
859 return message
860
861 def MergeLines(self, lines, message):
862 """Merges a text representation of a protocol message into a message."""
863 self._allow_multiple_scalars = True
864 self._ParseOrMerge(lines, message)
865 return message
866
867 def _ParseOrMerge(self, lines, message):
868 """Converts a text representation of a protocol message into a message.
869
870 Args:
871 lines: Lines of a message's text representation.
872 message: A protocol buffer message to merge into.
873
874 Raises:
875 ParseError: On text parsing problems.
876 """
877 # Tokenize expects native str lines.
878 try:
879 str_lines = (
880 line if isinstance(line, str) else line.decode('utf-8')
881 for line in lines)
882 tokenizer = Tokenizer(str_lines)
883 except UnicodeDecodeError as e:
884 raise ParseError from e
885 if message:
886 self.root_type = message.DESCRIPTOR.full_name
887 while not tokenizer.AtEnd():
888 self._MergeField(tokenizer, message)
889
890 def _MergeField(self, tokenizer, message):
891 """Merges a single protocol message field into a message.
892
893 Args:
894 tokenizer: A tokenizer to parse the field name and values.
895 message: A protocol message to record the data.
896
897 Raises:
898 ParseError: In case of text parsing problems.
899 """
900 message_descriptor = message.DESCRIPTOR
901 if (message_descriptor.full_name == _ANY_FULL_TYPE_NAME and
902 tokenizer.TryConsume('[')):
903 type_url_prefix, packed_type_name = self._ConsumeAnyTypeUrl(tokenizer)
904 tokenizer.Consume(']')
905 tokenizer.TryConsume(':')
906 self._DetectSilentMarker(tokenizer, message_descriptor.full_name,
907 type_url_prefix + '/' + packed_type_name)
908 if tokenizer.TryConsume('<'):
909 expanded_any_end_token = '>'
910 else:
911 tokenizer.Consume('{')
912 expanded_any_end_token = '}'
913 expanded_any_sub_message = _BuildMessageFromTypeName(packed_type_name,
914 self.descriptor_pool)
915 # Direct comparison with None is used instead of implicit bool conversion
916 # to avoid false positives with falsy initial values, e.g. for
917 # google.protobuf.ListValue.
918 if expanded_any_sub_message is None:
919 raise ParseError('Type %s not found in descriptor pool' %
920 packed_type_name)
921 while not tokenizer.TryConsume(expanded_any_end_token):
922 if tokenizer.AtEnd():
923 raise tokenizer.ParseErrorPreviousToken('Expected "%s".' %
924 (expanded_any_end_token,))
925 self._MergeField(tokenizer, expanded_any_sub_message)
926 deterministic = False
927
928 message.Pack(expanded_any_sub_message,
929 type_url_prefix=type_url_prefix,
930 deterministic=deterministic)
931 return
932
933 if tokenizer.TryConsume('['):
934 name = [tokenizer.ConsumeIdentifier()]
935 while tokenizer.TryConsume('.'):
936 name.append(tokenizer.ConsumeIdentifier())
937 name = '.'.join(name)
938
939 if not message_descriptor.is_extendable:
940 raise tokenizer.ParseErrorPreviousToken(
941 'Message type "%s" does not have extensions.' %
942 message_descriptor.full_name)
943 # pylint: disable=protected-access
944 field = message.Extensions._FindExtensionByName(name)
945 # pylint: enable=protected-access
946 if not field:
947 if self.allow_unknown_extension:
948 field = None
949 else:
950 raise tokenizer.ParseErrorPreviousToken(
951 'Extension "%s" not registered. '
952 'Did you import the _pb2 module which defines it? '
953 'If you are trying to place the extension in the MessageSet '
954 'field of another message that is in an Any or MessageSet field, '
955 'that message\'s _pb2 module must be imported as well' % name)
956 elif message_descriptor != field.containing_type:
957 raise tokenizer.ParseErrorPreviousToken(
958 'Extension "%s" does not extend message type "%s".' %
959 (name, message_descriptor.full_name))
960
961 tokenizer.Consume(']')
962
963 else:
964 name = tokenizer.ConsumeIdentifierOrNumber()
965 if self.allow_field_number and name.isdigit():
966 number = ParseInteger(name, True, True)
967 field = message_descriptor.fields_by_number.get(number, None)
968 if not field and message_descriptor.is_extendable:
969 field = message.Extensions._FindExtensionByNumber(number)
970 else:
971 field = message_descriptor.fields_by_name.get(name, None)
972
973 # Group names are expected to be capitalized as they appear in the
974 # .proto file, which actually matches their type names, not their field
975 # names.
976 if not field:
977 field = message_descriptor.fields_by_name.get(name.lower(), None)
978 if field and not _IsGroupLike(field):
979 field = None
980 if field and field.message_type.name != name:
981 field = None
982
983 if not field and not self.allow_unknown_field:
984 raise tokenizer.ParseErrorPreviousToken(
985 'Message type "%s" has no field named "%s".' %
986 (message_descriptor.full_name, name))
987
988 if field:
989 if not self._allow_multiple_scalars and field.containing_oneof:
990 # Check if there's a different field set in this oneof.
991 # Note that we ignore the case if the same field was set before, and we
992 # apply _allow_multiple_scalars to non-scalar fields as well.
993 which_oneof = message.WhichOneof(field.containing_oneof.name)
994 if which_oneof is not None and which_oneof != field.name:
995 raise tokenizer.ParseErrorPreviousToken(
996 'Field "%s" is specified along with field "%s", another member '
997 'of oneof "%s" for message type "%s".' %
998 (field.name, which_oneof, field.containing_oneof.name,
999 message_descriptor.full_name))
1000
1001 if field.cpp_type == descriptor.FieldDescriptor.CPPTYPE_MESSAGE:
1002 tokenizer.TryConsume(':')
1003 self._DetectSilentMarker(tokenizer, message_descriptor.full_name,
1004 field.full_name)
1005 merger = self._MergeMessageField
1006 else:
1007 tokenizer.Consume(':')
1008 self._DetectSilentMarker(tokenizer, message_descriptor.full_name,
1009 field.full_name)
1010 merger = self._MergeScalarField
1011
1012 if (field.is_repeated and
1013 tokenizer.TryConsume('[')):
1014 # Short repeated format, e.g. "foo: [1, 2, 3]"
1015 if not tokenizer.TryConsume(']'):
1016 while True:
1017 merger(tokenizer, message, field)
1018 if tokenizer.TryConsume(']'):
1019 break
1020 tokenizer.Consume(',')
1021
1022 else:
1023 merger(tokenizer, message, field)
1024
1025 else: # Proto field is unknown.
1026 assert (self.allow_unknown_extension or self.allow_unknown_field)
1027 self._SkipFieldContents(tokenizer, name, message_descriptor.full_name)
1028
1029 # For historical reasons, fields may optionally be separated by commas or
1030 # semicolons.
1031 if not tokenizer.TryConsume(','):
1032 tokenizer.TryConsume(';')
1033
1034 def _LogSilentMarker(self, immediate_message_type, field_name):
1035 pass
1036
1037 def _DetectSilentMarker(self, tokenizer, immediate_message_type, field_name):
1038 if tokenizer.contains_silent_marker_before_current_token:
1039 self._LogSilentMarker(immediate_message_type, field_name)
1040
1041 def _ConsumeAnyTypeUrl(self, tokenizer):
1042 """Consumes a google.protobuf.Any type URL and returns the type name."""
1043 # Consume "type.googleapis.com/".
1044 prefix = [tokenizer.ConsumeIdentifier()]
1045 tokenizer.Consume('.')
1046 prefix.append(tokenizer.ConsumeIdentifier())
1047 tokenizer.Consume('.')
1048 prefix.append(tokenizer.ConsumeIdentifier())
1049 tokenizer.Consume('/')
1050 # Consume the fully-qualified type name.
1051 name = [tokenizer.ConsumeIdentifier()]
1052 while tokenizer.TryConsume('.'):
1053 name.append(tokenizer.ConsumeIdentifier())
1054 return '.'.join(prefix), '.'.join(name)
1055
1056 def _MergeMessageField(self, tokenizer, message, field):
    """Merges a single message field into a message.
1058
1059 Args:
1060 tokenizer: A tokenizer to parse the field value.
1061 message: The message of which field is a member.
1062 field: The descriptor of the field to be merged.
1063
1064 Raises:
1065 ParseError: In case of text parsing problems.
1066 """
1067 is_map_entry = _IsMapEntry(field)
1068
1069 if tokenizer.TryConsume('<'):
1070 end_token = '>'
1071 else:
1072 tokenizer.Consume('{')
1073 end_token = '}'
1074
1075 if field.is_repeated:
1076 if field.is_extension:
1077 sub_message = message.Extensions[field].add()
1078 elif is_map_entry:
1079 sub_message = getattr(message, field.name).GetEntryClass()()
1080 else:
1081 sub_message = getattr(message, field.name).add()
1082 else:
1083 if field.is_extension:
1084 if (not self._allow_multiple_scalars and
1085 message.HasExtension(field)):
1086 raise tokenizer.ParseErrorPreviousToken(
1087 'Message type "%s" should not have multiple "%s" extensions.' %
1088 (message.DESCRIPTOR.full_name, field.full_name))
1089 sub_message = message.Extensions[field]
1090 else:
1091 # Also apply _allow_multiple_scalars to message field.
1092 # TODO: Change to _allow_singular_overwrites.
1093 if (not self._allow_multiple_scalars and
1094 message.HasField(field.name)):
1095 raise tokenizer.ParseErrorPreviousToken(
1096 'Message type "%s" should not have multiple "%s" fields.' %
1097 (message.DESCRIPTOR.full_name, field.name))
1098 sub_message = getattr(message, field.name)
1099 sub_message.SetInParent()
1100
1101 while not tokenizer.TryConsume(end_token):
1102 if tokenizer.AtEnd():
1103 raise tokenizer.ParseErrorPreviousToken('Expected "%s".' % (end_token,))
1104 self._MergeField(tokenizer, sub_message)
1105
1106 if is_map_entry:
1107 value_cpptype = field.message_type.fields_by_name['value'].cpp_type
1108 if value_cpptype == descriptor.FieldDescriptor.CPPTYPE_MESSAGE:
1109 value = getattr(message, field.name)[sub_message.key]
1110 value.CopyFrom(sub_message.value)
1111 else:
1112 getattr(message, field.name)[sub_message.key] = sub_message.value
1113
1114 def _MergeScalarField(self, tokenizer, message, field):
1115 """Merges a single scalar field into a message.
1116
1117 Args:
1118 tokenizer: A tokenizer to parse the field value.
1119 message: A protocol message to record the data.
1120 field: The descriptor of the field to be merged.
1121
1122 Raises:
1123 ParseError: In case of text parsing problems.
1124 RuntimeError: On runtime errors.
1125 """
1126 _ = self.allow_unknown_extension
1127 value = None
1128
1129 if field.type in (descriptor.FieldDescriptor.TYPE_INT32,
1130 descriptor.FieldDescriptor.TYPE_SINT32,
1131 descriptor.FieldDescriptor.TYPE_SFIXED32):
1132 value = _ConsumeInt32(tokenizer)
1133 elif field.type in (descriptor.FieldDescriptor.TYPE_INT64,
1134 descriptor.FieldDescriptor.TYPE_SINT64,
1135 descriptor.FieldDescriptor.TYPE_SFIXED64):
1136 value = _ConsumeInt64(tokenizer)
1137 elif field.type in (descriptor.FieldDescriptor.TYPE_UINT32,
1138 descriptor.FieldDescriptor.TYPE_FIXED32):
1139 value = _ConsumeUint32(tokenizer)
1140 elif field.type in (descriptor.FieldDescriptor.TYPE_UINT64,
1141 descriptor.FieldDescriptor.TYPE_FIXED64):
1142 value = _ConsumeUint64(tokenizer)
1143 elif field.type in (descriptor.FieldDescriptor.TYPE_FLOAT,
1144 descriptor.FieldDescriptor.TYPE_DOUBLE):
1145 value = tokenizer.ConsumeFloat()
1146 elif field.type == descriptor.FieldDescriptor.TYPE_BOOL:
1147 value = tokenizer.ConsumeBool()
1148 elif field.type == descriptor.FieldDescriptor.TYPE_STRING:
1149 value = tokenizer.ConsumeString()
1150 elif field.type == descriptor.FieldDescriptor.TYPE_BYTES:
1151 value = tokenizer.ConsumeByteString()
1152 elif field.type == descriptor.FieldDescriptor.TYPE_ENUM:
1153 value = tokenizer.ConsumeEnum(field)
1154 else:
1155 raise RuntimeError('Unknown field type %d' % field.type)
1156
1157 if field.is_repeated:
1158 if field.is_extension:
1159 message.Extensions[field].append(value)
1160 else:
1161 getattr(message, field.name).append(value)
1162 else:
1163 if field.is_extension:
1164 if (not self._allow_multiple_scalars and
1165 field.has_presence and
1166 message.HasExtension(field)):
1167 raise tokenizer.ParseErrorPreviousToken(
1168 'Message type "%s" should not have multiple "%s" extensions.' %
1169 (message.DESCRIPTOR.full_name, field.full_name))
1170 else:
1171 message.Extensions[field] = value
1172 else:
1173 duplicate_error = False
1174 if not self._allow_multiple_scalars:
1175 if field.has_presence:
1176 duplicate_error = message.HasField(field.name)
1177 else:
1178 # For field that doesn't represent presence, try best effort to
1179 # check multiple scalars by compare to default values.
1180 duplicate_error = not decoder.IsDefaultScalarValue(
1181 getattr(message, field.name)
1182 )
1183
1184 if duplicate_error:
1185 raise tokenizer.ParseErrorPreviousToken(
1186 'Message type "%s" should not have multiple "%s" fields.' %
1187 (message.DESCRIPTOR.full_name, field.name))
1188 else:
1189 setattr(message, field.name, value)
1190
1191 def _SkipFieldContents(self, tokenizer, field_name, immediate_message_type):
1192 """Skips over contents (value or message) of a field.
1193
1194 Args:
1195 tokenizer: A tokenizer to parse the field name and values.
1196 field_name: The field name currently being parsed.
1197 immediate_message_type: The type of the message immediately containing
1198 the silent marker.
1199 """
1200 # Try to guess the type of this field.
1201 # If this field is not a message, there should be a ":" between the
1202 # field name and the field value and also the field value should not
1203 # start with "{" or "<" which indicates the beginning of a message body.
1204 # If there is no ":" or there is a "{" or "<" after ":", this field has
1205 # to be a message or the input is ill-formed.
1206 if tokenizer.TryConsume(
1207 ':') and not tokenizer.LookingAt('{') and not tokenizer.LookingAt('<'):
1208 self._DetectSilentMarker(tokenizer, immediate_message_type, field_name)
1209 if tokenizer.LookingAt('['):
1210 self._SkipRepeatedFieldValue(tokenizer, immediate_message_type)
1211 else:
1212 self._SkipFieldValue(tokenizer)
1213 else:
1214 self._DetectSilentMarker(tokenizer, immediate_message_type, field_name)
1215 self._SkipFieldMessage(tokenizer, immediate_message_type)
1216
1217 def _SkipField(self, tokenizer, immediate_message_type):
1218 """Skips over a complete field (name and value/message).
1219
1220 Args:
1221 tokenizer: A tokenizer to parse the field name and values.
1222 immediate_message_type: The type of the message immediately containing
1223 the silent marker.
1224 """
1225 field_name = ''
1226 if tokenizer.TryConsume('['):
1227 # Consume extension or google.protobuf.Any type URL
1228 field_name += '[' + tokenizer.ConsumeIdentifier()
1229 num_identifiers = 1
1230 while tokenizer.TryConsume('.'):
1231 field_name += '.' + tokenizer.ConsumeIdentifier()
1232 num_identifiers += 1
1233 # This is possibly a type URL for an Any message.
1234 if num_identifiers == 3 and tokenizer.TryConsume('/'):
1235 field_name += '/' + tokenizer.ConsumeIdentifier()
1236 while tokenizer.TryConsume('.'):
1237 field_name += '.' + tokenizer.ConsumeIdentifier()
1238 tokenizer.Consume(']')
1239 field_name += ']'
1240 else:
1241 field_name += tokenizer.ConsumeIdentifierOrNumber()
1242
1243 self._SkipFieldContents(tokenizer, field_name, immediate_message_type)
1244
1245 # For historical reasons, fields may optionally be separated by commas or
1246 # semicolons.
1247 if not tokenizer.TryConsume(','):
1248 tokenizer.TryConsume(';')
1249
1250 def _SkipFieldMessage(self, tokenizer, immediate_message_type):
1251 """Skips over a field message.
1252
1253 Args:
1254 tokenizer: A tokenizer to parse the field name and values.
1255 immediate_message_type: The type of the message immediately containing
1256 the silent marker
1257 """
1258 if tokenizer.TryConsume('<'):
1259 delimiter = '>'
1260 else:
1261 tokenizer.Consume('{')
1262 delimiter = '}'
1263
1264 while not tokenizer.LookingAt('>') and not tokenizer.LookingAt('}'):
1265 self._SkipField(tokenizer, immediate_message_type)
1266
1267 tokenizer.Consume(delimiter)
1268
1269 def _SkipFieldValue(self, tokenizer):
1270 """Skips over a field value.
1271
1272 Args:
1273 tokenizer: A tokenizer to parse the field name and values.
1274
1275 Raises:
1276 ParseError: In case an invalid field value is found.
1277 """
    if (not tokenizer.TryConsumeByteString() and
1279 not tokenizer.TryConsumeIdentifier() and
1280 not _TryConsumeInt64(tokenizer) and
1281 not _TryConsumeUint64(tokenizer) and
1282 not tokenizer.TryConsumeFloat()):
1283 raise ParseError('Invalid field value: ' + tokenizer.token)
1284
1285 def _SkipRepeatedFieldValue(self, tokenizer, immediate_message_type):
1286 """Skips over a repeated field value.
1287
1288 Args:
      tokenizer: A tokenizer to parse the field value.
      immediate_message_type: The type of the message immediately containing
        the silent marker.
1290 """
1291 tokenizer.Consume('[')
1292 if not tokenizer.TryConsume(']'):
1293 while True:
1294 if tokenizer.LookingAt('<') or tokenizer.LookingAt('{'):
1295 self._SkipFieldMessage(tokenizer, immediate_message_type)
1296 else:
1297 self._SkipFieldValue(tokenizer)
1298 if tokenizer.TryConsume(']'):
1299 break
1300 tokenizer.Consume(',')
1301
1302
1303class Tokenizer(object):
1304 """Protocol buffer text representation tokenizer.
1305
1306 This class handles the lower level string parsing by splitting it into
1307 meaningful tokens.
1308
1309 It was directly ported from the Java protocol buffer API.
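
  Illustrative example of direct use (most callers go through Parse/Merge)::

    tokenizer = Tokenizer(['foo: 42'])
    tokenizer.ConsumeIdentifier()   # 'foo'
    tokenizer.Consume(':')
    tokenizer.ConsumeInteger()      # 42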
1310 """
1311
1312 _WHITESPACE = re.compile(r'\s+')
1313 _COMMENT = re.compile(r'(\s*#.*$)', re.MULTILINE)
1314 _WHITESPACE_OR_COMMENT = re.compile(r'(\s|(#.*$))+', re.MULTILINE)
1315 _TOKEN = re.compile('|'.join([
1316 r'[a-zA-Z_][0-9a-zA-Z_+-]*', # an identifier
1317 r'([0-9+-]|(\.[0-9]))[0-9a-zA-Z_.+-]*', # a number
1318 ] + [ # quoted str for each quote mark
1319 # Avoid backtracking! https://stackoverflow.com/a/844267
1320 r'{qt}[^{qt}\n\\]*((\\.)+[^{qt}\n\\]*)*({qt}|\\?$)'.format(qt=mark)
1321 for mark in _QUOTES
1322 ]))
1323
1324 _IDENTIFIER = re.compile(r'[^\d\W]\w*')
1325 _IDENTIFIER_OR_NUMBER = re.compile(r'\w+')
1326
1327 def __init__(self, lines, skip_comments=True):
1328 self._position = 0
1329 self._line = -1
1330 self._column = 0
1331 self._token_start = None
1332 self.token = ''
1333 self._lines = iter(lines)
1334 self._current_line = ''
1335 self._previous_line = 0
1336 self._previous_column = 0
1337 self._more_lines = True
1338 self._skip_comments = skip_comments
1339 self._whitespace_pattern = (skip_comments and self._WHITESPACE_OR_COMMENT
1340 or self._WHITESPACE)
1341 self.contains_silent_marker_before_current_token = False
1342
1343 self._SkipWhitespace()
1344 self.NextToken()
1345
1346 def LookingAt(self, token):
1347 return self.token == token
1348
1349 def AtEnd(self):
    """Checks whether the end of the text was reached.
1351
1352 Returns:
1353 True iff the end was reached.
1354 """
1355 return not self.token
1356
1357 def _PopLine(self):
1358 while len(self._current_line) <= self._column:
1359 try:
1360 self._current_line = next(self._lines)
1361 except StopIteration:
1362 self._current_line = ''
1363 self._more_lines = False
1364 return
1365 else:
1366 self._line += 1
1367 self._column = 0
1368
1369 def _SkipWhitespace(self):
1370 while True:
1371 self._PopLine()
1372 match = self._whitespace_pattern.match(self._current_line, self._column)
1373 if not match:
1374 break
1375 self.contains_silent_marker_before_current_token = match.group(0) == (
1376 ' ' + _DEBUG_STRING_SILENT_MARKER)
1377 length = len(match.group(0))
1378 self._column += length
1379
1380 def TryConsume(self, token):
1381 """Tries to consume a given piece of text.
1382
1383 Args:
1384 token: Text to consume.
1385
1386 Returns:
1387 True iff the text was consumed.
1388 """
1389 if self.token == token:
1390 self.NextToken()
1391 return True
1392 return False
1393
1394 def Consume(self, token):
1395 """Consumes a piece of text.
1396
1397 Args:
1398 token: Text to consume.
1399
1400 Raises:
1401 ParseError: If the text couldn't be consumed.
1402 """
1403 if not self.TryConsume(token):
1404 raise self.ParseError('Expected "%s".' % token)
1405
1406 def ConsumeComment(self):
1407 result = self.token
1408 if not self._COMMENT.match(result):
1409 raise self.ParseError('Expected comment.')
1410 self.NextToken()
1411 return result
1412
1413 def ConsumeCommentOrTrailingComment(self):
1414 """Consumes a comment, returns a 2-tuple (trailing bool, comment str)."""
1415
1416 # Tokenizer initializes _previous_line and _previous_column to 0. As the
1417 # tokenizer starts, it looks like there is a previous token on the line.
1418 just_started = self._line == 0 and self._column == 0
1419
1420 before_parsing = self._previous_line
1421 comment = self.ConsumeComment()
1422
    # A trailing comment is a comment on the same line as the previous token.
1424 trailing = (self._previous_line == before_parsing
1425 and not just_started)
1426
1427 return trailing, comment
1428
1429 def TryConsumeIdentifier(self):
1430 try:
1431 self.ConsumeIdentifier()
1432 return True
1433 except ParseError:
1434 return False
1435
1436 def ConsumeIdentifier(self):
1437 """Consumes protocol message field identifier.
1438
1439 Returns:
1440 Identifier string.
1441
1442 Raises:
1443 ParseError: If an identifier couldn't be consumed.
1444 """
1445 result = self.token
1446 if not self._IDENTIFIER.match(result):
1447 raise self.ParseError('Expected identifier.')
1448 self.NextToken()
1449 return result
1450
1451 def TryConsumeIdentifierOrNumber(self):
1452 try:
1453 self.ConsumeIdentifierOrNumber()
1454 return True
1455 except ParseError:
1456 return False
1457
1458 def ConsumeIdentifierOrNumber(self):
1459 """Consumes protocol message field identifier.
1460
1461 Returns:
1462 Identifier string.
1463
1464 Raises:
1465 ParseError: If an identifier couldn't be consumed.
1466 """
1467 result = self.token
1468 if not self._IDENTIFIER_OR_NUMBER.match(result):
1469 raise self.ParseError('Expected identifier or number, got %s.' % result)
1470 self.NextToken()
1471 return result
1472
1473 def TryConsumeInteger(self):
1474 try:
1475 self.ConsumeInteger()
1476 return True
1477 except ParseError:
1478 return False
1479
1480 def ConsumeInteger(self):
1481 """Consumes an integer number.
1482
1483 Returns:
1484 The integer parsed.
1485
1486 Raises:
1487 ParseError: If an integer couldn't be consumed.
1488 """
1489 try:
1490 result = _ParseAbstractInteger(self.token)
1491 except ValueError as e:
1492 raise self.ParseError(str(e))
1493 self.NextToken()
1494 return result
1495
1496 def TryConsumeFloat(self):
1497 try:
1498 self.ConsumeFloat()
1499 return True
1500 except ParseError:
1501 return False
1502
1503 def ConsumeFloat(self):
    """Consumes a floating point number.
1505
1506 Returns:
1507 The number parsed.
1508
1509 Raises:
1510 ParseError: If a floating point number couldn't be consumed.
1511 """
1512 try:
1513 result = ParseFloat(self.token)
1514 except ValueError as e:
1515 raise self.ParseError(str(e))
1516 self.NextToken()
1517 return result
1518
1519 def ConsumeBool(self):
1520 """Consumes a boolean value.
1521
1522 Returns:
1523 The bool parsed.
1524
1525 Raises:
1526 ParseError: If a boolean value couldn't be consumed.
1527 """
1528 try:
1529 result = ParseBool(self.token)
1530 except ValueError as e:
1531 raise self.ParseError(str(e))
1532 self.NextToken()
1533 return result
1534
1535 def TryConsumeByteString(self):
1536 try:
1537 self.ConsumeByteString()
1538 return True
1539 except ParseError:
1540 return False
1541
1542 def ConsumeString(self):
1543 """Consumes a string value.
1544
1545 Returns:
1546 The string parsed.
1547
1548 Raises:
1549 ParseError: If a string value couldn't be consumed.
1550 """
1551 the_bytes = self.ConsumeByteString()
1552 try:
1553 return str(the_bytes, 'utf-8')
1554 except UnicodeDecodeError as e:
1555 raise self._StringParseError(e)
1556
1557 def ConsumeByteString(self):
1558 """Consumes a byte array value.
1559
1560 Returns:
1561 The array parsed (as a string).
1562
1563 Raises:
1564 ParseError: If a byte array value couldn't be consumed.
1565 """
1566 the_list = [self._ConsumeSingleByteString()]
1567 while self.token and self.token[0] in _QUOTES:
1568 the_list.append(self._ConsumeSingleByteString())
1569 return b''.join(the_list)
1570
1571 def _ConsumeSingleByteString(self):
1572 """Consume one token of a string literal.
1573
1574 String literals (whether bytes or text) can come in multiple adjacent
1575 tokens which are automatically concatenated, like in C or Python. This
1576 method only consumes one token.
1577
1578 Returns:
1579 The token parsed.
1580 Raises:
1581 ParseError: When the wrong format data is found.
1582 """
1583 text = self.token
1584 if len(text) < 1 or text[0] not in _QUOTES:
1585 raise self.ParseError('Expected string but found: %r' % (text,))
1586
1587 if len(text) < 2 or text[-1] != text[0]:
1588 raise self.ParseError('String missing ending quote: %r' % (text,))
1589
1590 try:
1591 result = text_encoding.CUnescape(text[1:-1])
1592 except ValueError as e:
1593 raise self.ParseError(str(e))
1594 self.NextToken()
1595 return result
1596
1597 def ConsumeEnum(self, field):
1598 try:
1599 result = ParseEnum(field, self.token)
1600 except ValueError as e:
1601 raise self.ParseError(str(e))
1602 self.NextToken()
1603 return result
1604
1605 def ParseErrorPreviousToken(self, message):
1606 """Creates and *returns* a ParseError for the previously read token.
1607
1608 Args:
1609 message: A message to set for the exception.
1610
1611 Returns:
1612 A ParseError instance.
1613 """
1614 return ParseError(message, self._previous_line + 1,
1615 self._previous_column + 1)
1616
1617 def ParseError(self, message):
1618 """Creates and *returns* a ParseError for the current token."""
1619 return ParseError('\'' + self._current_line + '\': ' + message,
1620 self._line + 1, self._column + 1)
1621
1622 def _StringParseError(self, e):
1623 return self.ParseError('Couldn\'t parse string: ' + str(e))
1624
1625 def NextToken(self):
1626 """Reads the next meaningful token."""
1627 self._previous_line = self._line
1628 self._previous_column = self._column
1629 self.contains_silent_marker_before_current_token = False
1630
1631 self._column += len(self.token)
1632 self._SkipWhitespace()
1633
1634 if not self._more_lines:
1635 self.token = ''
1636 return
1637
1638 match = self._TOKEN.match(self._current_line, self._column)
1639 if not match and not self._skip_comments:
1640 match = self._COMMENT.match(self._current_line, self._column)
1641 if match:
1642 token = match.group(0)
1643 self.token = token
1644 else:
1645 self.token = self._current_line[self._column]
1646
1647# Aliased so it can still be accessed by current visibility violators.
1648# TODO: Migrate violators to textformat_tokenizer.
1649_Tokenizer = Tokenizer # pylint: disable=invalid-name
1650
1651
1652def _ConsumeInt32(tokenizer):
1653 """Consumes a signed 32bit integer number from tokenizer.
1654
1655 Args:
1656 tokenizer: A tokenizer used to parse the number.
1657
1658 Returns:
1659 The integer parsed.
1660
1661 Raises:
1662 ParseError: If a signed 32bit integer couldn't be consumed.
1663 """
1664 return _ConsumeInteger(tokenizer, is_signed=True, is_long=False)
1665
1666
1667def _ConsumeUint32(tokenizer):
1668 """Consumes an unsigned 32bit integer number from tokenizer.
1669
1670 Args:
1671 tokenizer: A tokenizer used to parse the number.
1672
1673 Returns:
1674 The integer parsed.
1675
1676 Raises:
1677 ParseError: If an unsigned 32bit integer couldn't be consumed.
1678 """
1679 return _ConsumeInteger(tokenizer, is_signed=False, is_long=False)
1680
1681
1682def _TryConsumeInt64(tokenizer):
1683 try:
1684 _ConsumeInt64(tokenizer)
1685 return True
1686 except ParseError:
1687 return False
1688
1689
1690def _ConsumeInt64(tokenizer):
  """Consumes a signed 64bit integer number from tokenizer.
1692
1693 Args:
1694 tokenizer: A tokenizer used to parse the number.
1695
1696 Returns:
1697 The integer parsed.
1698
1699 Raises:
    ParseError: If a signed 64bit integer couldn't be consumed.
1701 """
1702 return _ConsumeInteger(tokenizer, is_signed=True, is_long=True)
1703
1704
1705def _TryConsumeUint64(tokenizer):
1706 try:
1707 _ConsumeUint64(tokenizer)
1708 return True
1709 except ParseError:
1710 return False
1711
1712
1713def _ConsumeUint64(tokenizer):
1714 """Consumes an unsigned 64bit integer number from tokenizer.
1715
1716 Args:
1717 tokenizer: A tokenizer used to parse the number.
1718
1719 Returns:
1720 The integer parsed.
1721
1722 Raises:
1723 ParseError: If an unsigned 64bit integer couldn't be consumed.
1724 """
1725 return _ConsumeInteger(tokenizer, is_signed=False, is_long=True)
1726
1727
1728def _ConsumeInteger(tokenizer, is_signed=False, is_long=False):
1729 """Consumes an integer number from tokenizer.
1730
1731 Args:
1732 tokenizer: A tokenizer used to parse the number.
1733 is_signed: True if a signed integer must be parsed.
1734 is_long: True if a long integer must be parsed.
1735
1736 Returns:
1737 The integer parsed.
1738
1739 Raises:
1740 ParseError: If an integer with given characteristics couldn't be consumed.
1741 """
1742 try:
1743 result = ParseInteger(tokenizer.token, is_signed=is_signed, is_long=is_long)
1744 except ValueError as e:
1745 raise tokenizer.ParseError(str(e))
1746 tokenizer.NextToken()
1747 return result
1748
1749
1750def ParseInteger(text, is_signed=False, is_long=False):
1751 """Parses an integer.
1752
1753 Args:
1754 text: The text to parse.
1755 is_signed: True if a signed integer must be parsed.
1756 is_long: True if a long integer must be parsed.
1757
1758 Returns:
1759 The integer value.
1760
1761 Raises:
    ValueError: If the text is not a valid integer.
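
  Examples (illustrative)::

    ParseInteger('0x1f')                # 31 (hex accepted)
    ParseInteger('010')                 # 8 (C-style octal accepted)
    ParseInteger('-5', is_signed=True)  # -5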
1763 """
1764 # Do the actual parsing. Exception handling is propagated to caller.
1765 result = _ParseAbstractInteger(text)
1766
1767 # Check if the integer is sane. Exceptions handled by callers.
1768 checker = _INTEGER_CHECKERS[2 * int(is_long) + int(is_signed)]
1769 checker.CheckValue(result)
1770 return result
1771
1772
1773def _ParseAbstractInteger(text):
1774 """Parses an integer without checking size/signedness.
1775
1776 Args:
1777 text: The text to parse.
1778
1779 Returns:
1780 The integer value.
1781
1782 Raises:
    ValueError: If the text is not a valid integer.
1784 """
1785 # Do the actual parsing. Exception handling is propagated to caller.
1786 orig_text = text
1787 c_octal_match = re.match(r'(-?)0(\d+)$', text)
1788 if c_octal_match:
1789 # Python 3 no longer supports 0755 octal syntax without the 'o', so
1790 # we always use the '0o' prefix for multi-digit numbers starting with 0.
1791 text = c_octal_match.group(1) + '0o' + c_octal_match.group(2)
1792 try:
1793 return int(text, 0)
1794 except ValueError:
1795 raise ValueError('Couldn\'t parse integer: %s' % orig_text)
1796
1797
1798def ParseFloat(text):
1799 """Parse a floating point number.
1800
1801 Args:
1802 text: Text to parse.
1803
1804 Returns:
1805 The number parsed.
1806
1807 Raises:
1808 ValueError: If a floating point number couldn't be parsed.
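
  Examples (illustrative)::

    ParseFloat('1.5f')   # 1.5 (trailing 'f' suffix accepted)
    ParseFloat('-inf')   # float('-inf')
    ParseFloat('nan')    # float('nan')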
1809 """
1810 if _FLOAT_OCTAL_PREFIX.match(text):
1811 raise ValueError('Invalid octal float: %s' % text)
1812 try:
1813 # Assume Python compatible syntax.
1814 return float(text)
1815 except ValueError:
1816 # Check alternative spellings.
1817 if _FLOAT_INFINITY.match(text):
1818 if text[0] == '-':
1819 return float('-inf')
1820 else:
1821 return float('inf')
1822 elif _FLOAT_NAN.match(text):
1823 return float('nan')
1824 else:
1825 # assume '1.0f' format
1826 try:
1827 return float(text.rstrip('fF'))
1828 except ValueError:
1829 raise ValueError("Couldn't parse float: %s" % text)
1830
1831
1832def ParseBool(text):
1833 """Parse a boolean value.
1834
1835 Args:
1836 text: Text to parse.
1837
1838 Returns:
    The boolean value parsed.
1840
1841 Raises:
1842 ValueError: If text is not a valid boolean.
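
  Examples (illustrative)::

    ParseBool('true')   # True
    ParseBool('0')      # False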
1843 """
1844 if text in ('true', 't', '1', 'True'):
1845 return True
1846 elif text in ('false', 'f', '0', 'False'):
1847 return False
1848 else:
1849 raise ValueError('Expected "true" or "false".')
1850
1851
1852def ParseEnum(field, value):
1853 """Parse an enum value.
1854
1855 The value can be specified by a number (the enum value), or by
1856 a string literal (the enum name).
1857
1858 Args:
1859 field: Enum field descriptor.
1860 value: String value.
1861
1862 Returns:
1863 Enum value number.
1864
1865 Raises:
1866 ValueError: If the enum value could not be parsed.
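
  Example (illustrative; ``color`` is a hypothetical enum field of the
  hypothetical ``my_proto_pb2.MyMessage``)::

    field = my_proto_pb2.MyMessage.DESCRIPTOR.fields_by_name['color']
    ParseEnum(field, 'RED')  # the number of the (hypothetical) RED value
    ParseEnum(field, '1')    # a numeric literal is accepted as well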
1867 """
1868 enum_descriptor = field.enum_type
1869 try:
1870 number = int(value, 0)
1871 except ValueError:
1872 # Identifier.
1873 enum_value = enum_descriptor.values_by_name.get(value, None)
1874 if enum_value is None:
1875 raise ValueError('Enum type "%s" has no value named %s.' %
1876 (enum_descriptor.full_name, value))
1877 else:
1878 if not field.enum_type.is_closed:
1879 return number
1880 enum_value = enum_descriptor.values_by_number.get(number, None)
1881 if enum_value is None:
1882 raise ValueError('Enum type "%s" has no value with number %d.' %
1883 (enum_descriptor.full_name, number))
1884 return enum_value.number