Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/google/protobuf/text_format.py: 16%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

791 statements  

1# Protocol Buffers - Google's data interchange format 

2# Copyright 2008 Google Inc. All rights reserved. 

3# 

4# Use of this source code is governed by a BSD-style 

5# license that can be found in the LICENSE file or at 

6# https://developers.google.com/open-source/licenses/bsd 

7 

8"""Contains routines for printing protocol messages in text format. 

9 

10Simple usage example:: 

11 

12 # Create a proto object and serialize it to a text proto string. 

13 message = my_proto_pb2.MyMessage(foo='bar') 

14 text_proto = text_format.MessageToString(message) 

15 

16 # Parse a text proto string. 

17 message = text_format.Parse(text_proto, my_proto_pb2.MyMessage()) 

18""" 

19 

# Original module author, kept for attribution.
__author__ = 'kenton@google.com (Kenton Varda)'

21 

22# TODO Import thread contention leads to test failures. 

23import encodings.raw_unicode_escape # pylint: disable=unused-import 

24import encodings.unicode_escape # pylint: disable=unused-import 

25import io 

26import math 

27import re 

28import warnings 

29 

30from google.protobuf.internal import decoder 

31from google.protobuf.internal import type_checkers 

32from google.protobuf import descriptor 

33from google.protobuf import text_encoding 

34from google.protobuf import unknown_fields 

35 

# pylint: disable=g-import-not-at-top
# Public API of this module.
__all__ = ['MessageToString', 'Parse', 'PrintMessage', 'PrintField',
           'PrintFieldValue', 'Merge', 'MessageToBytes']

# Range checkers used when parsing integer literals into proto int fields.
_INTEGER_CHECKERS = (type_checkers.Uint32ValueChecker(),
                     type_checkers.Int32ValueChecker(),
                     type_checkers.Uint64ValueChecker(),
                     type_checkers.Int64ValueChecker())
# Matches 'inf'/'infinity' with optional sign and optional trailing 'f',
# case-insensitively (text-format float spellings).
_FLOAT_INFINITY = re.compile('-?inf(?:inity)?f?$', re.IGNORECASE)
# Matches 'nan' with optional trailing 'f', case-insensitively.
_FLOAT_NAN = re.compile('nanf?$', re.IGNORECASE)
# Matches a leading-zero (octal-looking) integer literal, optionally signed.
_FLOAT_OCTAL_PREFIX = re.compile('-?0[0-9]+')
# Matches a single percent-encoded byte, e.g. '%2F'.
_PERCENT_ENCODING = re.compile(r'^%[\da-fA-F][\da-fA-F]$')
# Matches a dotted protobuf type name, e.g. 'google.protobuf.Any'.
_TYPE_NAME = re.compile(r'^[^\d\W]\w*(\.[^\d\W]\w*)*$')
_QUOTES = frozenset(("'", '"'))
_ANY_FULL_TYPE_NAME = 'google.protobuf.Any'
# Marker emitted by C++ DebugString to signal non-round-trippable output.
_DEBUG_STRING_SILENT_MARKER = '\t '

# Module-wide default for the as_utf8 parameter of the printing functions.
_as_utf8_default = True

54 

55 

class Error(Exception):
  """Base exception for all errors raised by the text_format module."""

58 

59 

class ParseError(Error):
  """Thrown in case of text parsing or tokenizing error."""

  def __init__(self, message=None, line=None, column=None):
    # When both a message and a line are known, prefix the message with a
    # 'line' or 'line:column' location string.
    if message is not None and line is not None:
      loc = str(line)
      if column is not None:
        loc += ':{0}'.format(column)
      message = '{0} : {1}'.format(loc, message)
    # Forward the (possibly decorated) message to the base class; an absent
    # message yields a bare exception, matching Exception() semantics.
    args = () if message is None else (message,)
    super(ParseError, self).__init__(*args)
    self._line = line
    self._column = column

  def GetLine(self):
    """Returns the 1-based line of the error, or None if unknown."""
    return self._line

  def GetColumn(self):
    """Returns the 1-based column of the error, or None if unknown."""
    return self._column

81 

82 

class TextWriter(object):
  """In-memory text sink used to accumulate text-format output.

  The as_utf8 constructor argument is accepted for interface compatibility
  but is not consulted: the underlying buffer always holds unicode text.
  """

  def __init__(self, as_utf8):
    # as_utf8 intentionally unused; see class docstring.
    self._writer = io.StringIO()

  def write(self, val):
    """Appends val to the buffer; returns the number of characters written."""
    return self._writer.write(val)

  def close(self):
    """Releases the underlying buffer."""
    return self._writer.close()

  def getvalue(self):
    """Returns everything written so far as a single string."""
    return self._writer.getvalue()

96 

97 

def MessageToString(
    message,
    as_utf8=_as_utf8_default,
    as_one_line=False,
    use_short_repeated_primitives=False,
    pointy_brackets=False,
    use_index_order=False,
    use_field_number=False,
    descriptor_pool=None,
    indent=0,
    message_formatter=None,
    print_unknown_fields=False,
    force_colon=False) -> str:
  """Renders a protobuf message as a text-format string.

  Args:
    message: The protocol buffers message.
    as_utf8: Return unescaped Unicode for non-ASCII characters.
    as_one_line: Don't introduce newlines between fields.
    use_short_repeated_primitives: Use short repeated format for primitives.
    pointy_brackets: If True, use angle brackets instead of curly braces for
      nesting.
    use_index_order: If True, fields of a proto message will be printed using
      the order defined in source code instead of the field number, extensions
      will be printed at the end of the message and their relative order is
      determined by the extension number. By default, use the field number
      order.
    use_field_number: If True, print field numbers instead of names.
    descriptor_pool (DescriptorPool): Descriptor pool used to resolve Any types.
    indent (int): The initial indent level, in terms of spaces, for pretty
      print.
    message_formatter (function(message, indent, as_one_line) -> unicode|None):
      Custom formatter for selected sub-messages (usually based on message
      type). Use to pretty print parts of the protobuf for easier diffing.
    print_unknown_fields: If True, unknown fields will be printed.
    force_colon: If set, a colon will be added after the field name even if the
      field is a proto message.

  Returns:
    str: A string of the text formatted protocol buffer message.
  """
  out = TextWriter(as_utf8)
  _Printer(
      out=out,
      indent=indent,
      as_utf8=as_utf8,
      as_one_line=as_one_line,
      use_short_repeated_primitives=use_short_repeated_primitives,
      pointy_brackets=pointy_brackets,
      use_index_order=use_index_order,
      use_field_number=use_field_number,
      descriptor_pool=descriptor_pool,
      message_formatter=message_formatter,
      print_unknown_fields=print_unknown_fields,
      force_colon=force_colon,
  ).PrintMessage(message)
  result = out.getvalue()
  out.close()
  # Single-line output ends with a trailing field separator; drop it.
  return result.rstrip() if as_one_line else result

160 

161 

def MessageToBytes(message, **kwargs) -> bytes:
  """Convert protobuf message to encoded text format.  See MessageToString.

  Args:
    message: The protocol buffers message.
    **kwargs: Keyword arguments forwarded to MessageToString.

  Returns:
    bytes: The text-format rendering, encoded as UTF-8 when as_utf8 is in
    effect (which it is by default), otherwise as ASCII.
  """
  text = MessageToString(message, **kwargs)
  if isinstance(text, bytes):
    return text
  # Honor the module-wide default for as_utf8.  Without the explicit
  # fallback, omitting the keyword selected the 'ascii' codec even though
  # MessageToString defaults to emitting unescaped non-ASCII characters,
  # which made the default call raise UnicodeEncodeError on such content.
  # ASCII output is unchanged: it encodes identically under UTF-8.
  codec = 'utf-8' if kwargs.get('as_utf8', _as_utf8_default) else 'ascii'
  return text.encode(codec)

169 

170 

def _IsMapEntry(field):
  """Returns True iff field is a synthesized map-entry message field."""
  if field.type != descriptor.FieldDescriptor.TYPE_MESSAGE:
    return False
  entry_type = field.message_type
  return entry_type.has_options and entry_type.GetOptions().map_entry

175 

176 

def _IsGroupLike(field):
  """Determines if a field is consistent with a proto2 group.

  Args:
    field: The field descriptor.

  Returns:
    True if this field is group-like, false otherwise.
  """
  # Groups are always tag-delimited.
  if field.type != descriptor.FieldDescriptor.TYPE_GROUP:
    return False

  # Group fields always are always the lowercase type name.
  if field.name != field.message_type.name.lower():
    return False

  # Both the field and its type must come from the same file.
  if field.message_type.file != field.file:
    return False

  # Group messages are always defined in the same scope as the field.  File
  # level extensions will compare NULL == NULL here, which is why the file
  # comparison above is necessary to ensure both come from the same file.
  if field.is_extension:
    return field.message_type.containing_type == field.extension_scope
  return field.message_type.containing_type == field.containing_type

205 

206 

def PrintMessage(message,
                 out,
                 indent=0,
                 as_utf8=_as_utf8_default,
                 as_one_line=False,
                 use_short_repeated_primitives=False,
                 pointy_brackets=False,
                 use_index_order=False,
                 use_field_number=False,
                 descriptor_pool=None,
                 message_formatter=None,
                 print_unknown_fields=False,
                 force_colon=False):
  """Renders message in text format onto the out stream.

  Args:
    message: The Message object to convert to text format.
    out: A file handle to write the message to.
    indent: The initial indent level for pretty print.
    as_utf8: Return unescaped Unicode for non-ASCII characters.
    as_one_line: Don't introduce newlines between fields.
    use_short_repeated_primitives: Use short repeated format for primitives.
    pointy_brackets: If True, use angle brackets instead of curly braces for
      nesting.
    use_index_order: If True, print fields of a proto message using the order
      defined in source code instead of the field number. By default, use the
      field number order.
    use_field_number: If True, print field numbers instead of names.
    descriptor_pool: A DescriptorPool used to resolve Any types.
    message_formatter: A function(message, indent, as_one_line): unicode|None
      to custom format selected sub-messages (usually based on message type).
      Use to pretty print parts of the protobuf for easier diffing.
    print_unknown_fields: If True, unknown fields will be printed.
    force_colon: If set, a colon will be added after the field name even if
      the field is a proto message.
  """
  # Delegate all rendering to a single-use printer configured from the
  # keyword arguments above.
  _Printer(
      out=out,
      indent=indent,
      as_utf8=as_utf8,
      as_one_line=as_one_line,
      use_short_repeated_primitives=use_short_repeated_primitives,
      pointy_brackets=pointy_brackets,
      use_index_order=use_index_order,
      use_field_number=use_field_number,
      descriptor_pool=descriptor_pool,
      message_formatter=message_formatter,
      print_unknown_fields=print_unknown_fields,
      force_colon=force_colon,
  ).PrintMessage(message)

255 

256 

def PrintField(field,
               value,
               out,
               indent=0,
               as_utf8=_as_utf8_default,
               as_one_line=False,
               use_short_repeated_primitives=False,
               pointy_brackets=False,
               use_index_order=False,
               message_formatter=None,
               print_unknown_fields=False,
               force_colon=False):
  """Writes one field as a `name: value` pair onto the out stream."""
  _Printer(
      out=out,
      indent=indent,
      as_utf8=as_utf8,
      as_one_line=as_one_line,
      use_short_repeated_primitives=use_short_repeated_primitives,
      pointy_brackets=pointy_brackets,
      use_index_order=use_index_order,
      message_formatter=message_formatter,
      print_unknown_fields=print_unknown_fields,
      force_colon=force_colon,
  ).PrintField(field, value)

283 

284 

def PrintFieldValue(field,
                    value,
                    out,
                    indent=0,
                    as_utf8=_as_utf8_default,
                    as_one_line=False,
                    use_short_repeated_primitives=False,
                    pointy_brackets=False,
                    use_index_order=False,
                    message_formatter=None,
                    print_unknown_fields=False,
                    force_colon=False):
  """Writes one field value (without the field name) onto the out stream."""
  _Printer(
      out=out,
      indent=indent,
      as_utf8=as_utf8,
      as_one_line=as_one_line,
      use_short_repeated_primitives=use_short_repeated_primitives,
      pointy_brackets=pointy_brackets,
      use_index_order=use_index_order,
      message_formatter=message_formatter,
      print_unknown_fields=print_unknown_fields,
      force_colon=force_colon,
  ).PrintFieldValue(field, value)

311 

312 

def _BuildMessageFromTypeName(type_name, descriptor_pool):
  """Instantiates an empty protobuf message for a fully-qualified type name.

  Args:
    type_name: Fully-qualified protobuf message type name string.
    descriptor_pool: DescriptorPool instance; the default pool is used when
      None is given.

  Returns:
    A Message instance of type matching type_name, or None if no Descriptor
    matching type_name was found in the pool.
  """
  # Imported lazily to avoid import cycles at module load time.
  # pylint: disable=g-import-not-at-top
  if descriptor_pool is None:
    from google.protobuf import descriptor_pool as pool_mod
    descriptor_pool = pool_mod.Default()
  from google.protobuf import message_factory

  try:
    message_descriptor = descriptor_pool.FindMessageTypeByName(type_name)
  except KeyError:
    return None
  return message_factory.GetMessageClass(message_descriptor)()

337 

338 

# These values must match WireType enum in //google/protobuf/wire_format.h.
# Used below to classify entries of an UnknownFieldSet while printing.
WIRETYPE_LENGTH_DELIMITED = 2
WIRETYPE_START_GROUP = 3

342 

343 

class _Printer(object):
  """Text format printer for protocol message.

  Holds the formatting options and the output stream, and walks a message
  tree recursively, mutating self.indent as it descends into nested
  messages and unknown-field groups.
  """

  def __init__(
      self,
      out,
      indent=0,
      as_utf8=_as_utf8_default,
      as_one_line=False,
      use_short_repeated_primitives=False,
      pointy_brackets=False,
      use_index_order=False,
      use_field_number=False,
      descriptor_pool=None,
      message_formatter=None,
      print_unknown_fields=False,
      force_colon=False,
  ):
    """Initialize the Printer.

    Args:
      out: To record the text format result.
      indent: The initial indent level for pretty print.
      as_utf8: Return unescaped Unicode for non-ASCII characters.
      as_one_line: Don't introduce newlines between fields.
      use_short_repeated_primitives: Use short repeated format for primitives.
      pointy_brackets: If True, use angle brackets instead of curly braces for
        nesting.
      use_index_order: If True, print fields of a proto message using the order
        defined in source code instead of the field number. By default, use the
        field number order.
      use_field_number: If True, print field numbers instead of names.
      descriptor_pool: A DescriptorPool used to resolve Any types.
      message_formatter: A function(message, indent, as_one_line): unicode|None
        to custom format selected sub-messages (usually based on message type).
        Use to pretty print parts of the protobuf for easier diffing.
      print_unknown_fields: If True, unknown fields will be printed.
      force_colon: If set, a colon will be added after the field name even if
        the field is a proto message.
    """
    self.out = out
    self.indent = indent
    self.as_utf8 = as_utf8
    self.as_one_line = as_one_line
    self.use_short_repeated_primitives = use_short_repeated_primitives
    self.pointy_brackets = pointy_brackets
    self.use_index_order = use_index_order
    self.use_field_number = use_field_number
    self.descriptor_pool = descriptor_pool
    self.message_formatter = message_formatter
    self.print_unknown_fields = print_unknown_fields
    self.force_colon = force_colon

  def _TryPrintAsAnyMessage(self, message):
    """Serializes if message is a google.protobuf.Any field.

    Returns True when the Any payload could be resolved and was printed in
    expanded `[type.url/Type] { ... }` form; False otherwise.
    """
    if '/' not in message.type_url:
      return False
    packed_message = _BuildMessageFromTypeName(message.TypeName(),
                                               self.descriptor_pool)
    if packed_message is not None:
      packed_message.MergeFromString(message.value)
      colon = ':' if self.force_colon else ''
      self.out.write('%s[%s]%s ' % (self.indent * ' ', message.type_url, colon))
      self._PrintMessageFieldValue(packed_message)
      self.out.write(' ' if self.as_one_line else '\n')
      return True
    else:
      # Type not found in the pool; fall back to default Any printing.
      return False

  def _TryCustomFormatMessage(self, message):
    """Lets self.message_formatter render message; True if it produced text."""
    formatted = self.message_formatter(message, self.indent, self.as_one_line)
    if formatted is None:
      return False

    out = self.out
    out.write(' ' * self.indent)
    out.write(formatted)
    out.write(' ' if self.as_one_line else '\n')
    return True

  def PrintMessage(self, message):
    """Convert protobuf message to text format.

    Args:
      message: The protocol buffers message.
    """
    # A custom formatter, when configured, takes precedence over everything.
    if self.message_formatter and self._TryCustomFormatMessage(message):
      return
    # Any messages are expanded in place when their payload type is known.
    if (message.DESCRIPTOR.full_name == _ANY_FULL_TYPE_NAME and
        self._TryPrintAsAnyMessage(message)):
      return
    fields = message.ListFields()
    if self.use_index_order:
      fields.sort(
          key=lambda x: x[0].number if x[0].is_extension else x[0].index)
    for field, value in fields:
      if _IsMapEntry(field):
        for key in sorted(value):
          # This is slow for maps with submessage entries because it copies the
          # entire tree.  Unfortunately this would take significant refactoring
          # of this file to work around.
          #
          # TODO: refactor and optimize if this becomes an issue.
          entry_submsg = value.GetEntryClass()(key=key, value=value[key])
          self.PrintField(field, entry_submsg)
      elif field.is_repeated:
        if (self.use_short_repeated_primitives
            and field.cpp_type != descriptor.FieldDescriptor.CPPTYPE_MESSAGE
            and field.cpp_type != descriptor.FieldDescriptor.CPPTYPE_STRING):
          self._PrintShortRepeatedPrimitivesValue(field, value)
        else:
          for element in value:
            self.PrintField(field, element)
      else:
        self.PrintField(field, value)

    if self.print_unknown_fields:
      self._PrintUnknownFields(unknown_fields.UnknownFieldSet(message))

  def _PrintUnknownFields(self, unknown_field_set):
    """Print unknown fields, recursing into groups and embedded messages."""
    out = self.out
    for field in unknown_field_set:
      out.write(' ' * self.indent)
      out.write(str(field.field_number))
      if field.wire_type == WIRETYPE_START_GROUP:
        if self.as_one_line:
          out.write(' { ')
        else:
          out.write(' {\n')
          self.indent += 2

        self._PrintUnknownFields(field.data)

        if self.as_one_line:
          out.write('} ')
        else:
          self.indent -= 2
          out.write(' ' * self.indent + '}\n')
      elif field.wire_type == WIRETYPE_LENGTH_DELIMITED:
        try:
          # If this field is parseable as a Message, it is probably
          # an embedded message.
          # pylint: disable=protected-access
          (embedded_unknown_message, pos) = decoder._DecodeUnknownFieldSet(
              memoryview(field.data), 0, len(field.data))
        except Exception:  # pylint: disable=broad-except
          pos = 0

        # Only treat the data as an embedded message if the whole payload
        # decoded cleanly; otherwise print it as an escaped string.
        if pos == len(field.data):
          if self.as_one_line:
            out.write(' { ')
          else:
            out.write(' {\n')
            self.indent += 2

          self._PrintUnknownFields(embedded_unknown_message)

          if self.as_one_line:
            out.write('} ')
          else:
            self.indent -= 2
            out.write(' ' * self.indent + '}\n')
        else:
          # A string or bytes field. self.as_utf8 may not work.
          out.write(': \"')
          out.write(text_encoding.CEscape(field.data, False))
          out.write('\" ' if self.as_one_line else '\"\n')
      else:
        # varint, fixed32, fixed64
        out.write(': ')
        out.write(str(field.data))
        out.write(' ' if self.as_one_line else '\n')

  def _PrintFieldName(self, field):
    """Print field name (number, extension brackets, group, or plain name)."""
    out = self.out
    out.write(' ' * self.indent)
    if self.use_field_number:
      out.write(str(field.number))
    else:
      if field.is_extension:
        out.write('[')
        # MessageSet items are printed by their payload type name rather
        # than the extension field's own name.
        if (field.containing_type.GetOptions().message_set_wire_format and
            field.type == descriptor.FieldDescriptor.TYPE_MESSAGE and
            not field.is_required and
            not field.is_repeated):
          out.write(field.message_type.full_name)
        else:
          out.write(field.full_name)
        out.write(']')
      elif _IsGroupLike(field):
        # For groups, use the capitalized name.
        out.write(field.message_type.name)
      else:
        out.write(field.name)

    if (self.force_colon or
        field.cpp_type != descriptor.FieldDescriptor.CPPTYPE_MESSAGE):
      # The colon is optional in this case, but our cross-language golden files
      # don't include it. Here, the colon is only included if force_colon is
      # set to True
      out.write(':')

  def PrintField(self, field, value):
    """Print a single field name/value pair."""
    self._PrintFieldName(field)
    self.out.write(' ')
    self.PrintFieldValue(field, value)
    self.out.write(' ' if self.as_one_line else '\n')

  def _PrintShortRepeatedPrimitivesValue(self, field, value):
    """Prints a repeated primitive field in short `name: [a, b, c]` form."""
    # Note: this is called only when value has at least one element.
    self._PrintFieldName(field)
    self.out.write(' [')
    for i in range(len(value) - 1):
      self.PrintFieldValue(field, value[i])
      self.out.write(', ')
    self.PrintFieldValue(field, value[-1])
    self.out.write(']')
    self.out.write(' ' if self.as_one_line else '\n')

  def _PrintMessageFieldValue(self, value):
    """Prints a nested message inside `{ }` or `< >` delimiters."""
    if self.pointy_brackets:
      openb = '<'
      closeb = '>'
    else:
      openb = '{'
      closeb = '}'

    if self.as_one_line:
      self.out.write('%s ' % openb)
      self.PrintMessage(value)
      self.out.write(closeb)
    else:
      self.out.write('%s\n' % openb)
      self.indent += 2
      self.PrintMessage(value)
      self.indent -= 2
      self.out.write(' ' * self.indent + closeb)

  def PrintFieldValue(self, field, value):
    """Print a single field value (not including name).

    For repeated fields, the value should be a single element.

    Args:
      field: The descriptor of the field to be printed.
      value: The value of the field.
    """
    out = self.out
    if field.cpp_type == descriptor.FieldDescriptor.CPPTYPE_MESSAGE:
      self._PrintMessageFieldValue(value)
    elif field.cpp_type == descriptor.FieldDescriptor.CPPTYPE_ENUM:
      enum_value = field.enum_type.values_by_number.get(value, None)
      # Unknown enum numbers are printed numerically.
      if enum_value is not None:
        out.write(enum_value.name)
      else:
        out.write(str(value))
    elif field.cpp_type == descriptor.FieldDescriptor.CPPTYPE_STRING:
      out.write('\"')
      if isinstance(value, str) and not self.as_utf8:
        out_value = value.encode('utf-8')
      else:
        out_value = value
      if field.type == descriptor.FieldDescriptor.TYPE_BYTES:
        # We always need to escape all binary data in TYPE_BYTES fields.
        out_as_utf8 = False
      else:
        out_as_utf8 = self.as_utf8
      out.write(text_encoding.CEscape(out_value, out_as_utf8))
      out.write('\"')
    elif field.cpp_type == descriptor.FieldDescriptor.CPPTYPE_BOOL:
      if value:
        out.write('true')
      else:
        out.write('false')
    elif field.cpp_type == descriptor.FieldDescriptor.CPPTYPE_FLOAT:
      if math.isnan(value):
        out.write(str(value))
      else:
        # Shortest float repr that round-trips, matching C++ output.
        out.write(str(type_checkers.ToShortestFloat(value)))
    else:
      out.write(str(value))

629 

630 

def Parse(text,
          message,
          allow_unknown_extension=False,
          allow_field_number=False,
          descriptor_pool=None,
          allow_unknown_field=False):
  """Parses a text representation of a protocol message into a message.

  NOTE: for historical reasons this function does not clear the input
  message.  This is different from what the binary msg.ParseFrom(...) does.
  If text contains a field already set in message, the value is appended if
  the field is repeated.  Otherwise, a ParseError is raised.  Use Merge()
  instead if repeated scalar values should overwrite, and clear the message
  yourself if needed.

  Example::

    a = MyProto()
    a.repeated_field.append('test')
    b = MyProto()

    # Repeated fields are combined
    text_format.Parse(repr(a), b)
    text_format.Parse(repr(a), b) # repeated_field contains ["test", "test"]

    # Non-repeated fields cannot be overwritten
    a.singular_field = 1
    b.singular_field = 2
    text_format.Parse(repr(a), b) # ParseError

    # Binary version:
    b.ParseFromString(a.SerializeToString()) # repeated_field is now "test"

  Args:
    text (str): Message text representation.
    message (Message): A protocol buffer message to merge into.
    allow_unknown_extension: if True, skip over missing extensions and keep
      parsing
    allow_field_number: if True, both field number and field name are allowed.
    descriptor_pool (DescriptorPool): Descriptor pool used to resolve Any types.
    allow_unknown_field: if True, skip over unknown field and keep
      parsing. Avoid to use this option if possible. It may hide some
      errors (e.g. spelling error on field name)

  Returns:
    Message: The same message passed as argument.

  Raises:
    ParseError: On text parsing problems.
  """
  # Split on a separator of the same type (bytes vs str) as the input.
  newline = b'\n' if isinstance(text, bytes) else u'\n'
  return ParseLines(
      text.split(newline),
      message,
      allow_unknown_extension,
      allow_field_number,
      descriptor_pool=descriptor_pool,
      allow_unknown_field=allow_unknown_field)

687 

688 

def Merge(text,
          message,
          allow_unknown_extension=False,
          allow_field_number=False,
          descriptor_pool=None,
          allow_unknown_field=False):
  """Parses a text representation of a protocol message into a message.

  Like Parse(), but allows repeated values for a non-repeated field, and uses
  the last one.  This means any non-repeated, top-level fields specified in
  text replace those in the message.

  Args:
    text (str): Message text representation.
    message (Message): A protocol buffer message to merge into.
    allow_unknown_extension: if True, skip over missing extensions and keep
      parsing
    allow_field_number: if True, both field number and field name are allowed.
    descriptor_pool (DescriptorPool): Descriptor pool used to resolve Any types.
    allow_unknown_field: if True, skip over unknown field and keep
      parsing. Avoid to use this option if possible. It may hide some
      errors (e.g. spelling error on field name)

  Returns:
    Message: The same message passed as argument.

  Raises:
    ParseError: On text parsing problems.
  """
  # Split on a separator of the same type (bytes vs str) as the input.
  newline = b'\n' if isinstance(text, bytes) else u'\n'
  return MergeLines(
      text.split(newline),
      message,
      allow_unknown_extension,
      allow_field_number,
      descriptor_pool=descriptor_pool,
      allow_unknown_field=allow_unknown_field)

725 

726 

def ParseLines(lines,
               message,
               allow_unknown_extension=False,
               allow_field_number=False,
               descriptor_pool=None,
               allow_unknown_field=False):
  """Parses an iterable of text-format lines into a message.

  See Parse() for caveats (the input message is not cleared, and duplicate
  scalar fields raise an error).

  Args:
    lines: An iterable of lines of a message's text representation.
    message: A protocol buffer message to merge into.
    allow_unknown_extension: if True, skip over missing extensions and keep
      parsing
    allow_field_number: if True, both field number and field name are allowed.
    descriptor_pool: A DescriptorPool used to resolve Any types.
    allow_unknown_field: if True, skip over unknown field and keep
      parsing. Avoid to use this option if possible. It may hide some
      errors (e.g. spelling error on field name)

  Returns:
    The same message passed as argument.

  Raises:
    ParseError: On text parsing problems.
  """
  return _Parser(
      allow_unknown_extension,
      allow_field_number,
      descriptor_pool=descriptor_pool,
      allow_unknown_field=allow_unknown_field).ParseLines(lines, message)

759 

760 

def MergeLines(lines,
               message,
               allow_unknown_extension=False,
               allow_field_number=False,
               descriptor_pool=None,
               allow_unknown_field=False):
  """Merges an iterable of text-format lines into a message.

  See Merge() for more details (repeated scalar values for a non-repeated
  field overwrite rather than raise).

  Args:
    lines: An iterable of lines of a message's text representation.
    message: A protocol buffer message to merge into.
    allow_unknown_extension: if True, skip over missing extensions and keep
      parsing
    allow_field_number: if True, both field number and field name are allowed.
    descriptor_pool: A DescriptorPool used to resolve Any types.
    allow_unknown_field: if True, skip over unknown field and keep
      parsing. Avoid to use this option if possible. It may hide some
      errors (e.g. spelling error on field name)

  Returns:
    The same message passed as argument.

  Raises:
    ParseError: On text parsing problems.
  """
  return _Parser(
      allow_unknown_extension,
      allow_field_number,
      descriptor_pool=descriptor_pool,
      allow_unknown_field=allow_unknown_field).MergeLines(lines, message)

793 

794 

795class _Parser(object): 

796 """Text format parser for protocol message.""" 

797 

  def __init__(self,
               allow_unknown_extension=False,
               allow_field_number=False,
               descriptor_pool=None,
               allow_unknown_field=False):
    """Stores the parsing options.

    Args:
      allow_unknown_extension: If True, skip over missing extensions.
      allow_field_number: If True, accept field numbers as well as names.
      descriptor_pool: A DescriptorPool used to resolve Any types.
      allow_unknown_field: If True, skip over unknown fields (may hide
        errors such as misspelled field names).
    """
    self.allow_unknown_extension = allow_unknown_extension
    self.allow_field_number = allow_field_number
    self.descriptor_pool = descriptor_pool
    self.allow_unknown_field = allow_unknown_field

807 

808 def ParseLines(self, lines, message): 

809 """Parses a text representation of a protocol message into a message.""" 

810 self._allow_multiple_scalars = False 

811 self._ParseOrMerge(lines, message) 

812 return message 

813 

814 def MergeLines(self, lines, message): 

815 """Merges a text representation of a protocol message into a message.""" 

816 self._allow_multiple_scalars = True 

817 self._ParseOrMerge(lines, message) 

818 return message 

819 

820 def _ParseOrMerge(self, lines, message): 

821 """Converts a text representation of a protocol message into a message. 

822 

823 Args: 

824 lines: Lines of a message's text representation. 

825 message: A protocol buffer message to merge into. 

826 

827 Raises: 

828 ParseError: On text parsing problems. 

829 """ 

830 # Tokenize expects native str lines. 

831 try: 

832 str_lines = ( 

833 line if isinstance(line, str) else line.decode('utf-8') 

834 for line in lines) 

835 tokenizer = Tokenizer(str_lines) 

836 except UnicodeDecodeError as e: 

837 raise ParseError from e 

838 if message: 

839 self.root_type = message.DESCRIPTOR.full_name 

840 while not tokenizer.AtEnd(): 

841 self._MergeField(tokenizer, message) 

842 

  def _MergeField(self, tokenizer, message):
    """Merges a single protocol message field into a message.

    Handles three cases: an expanded google.protobuf.Any body
    (``[type_url] { ... }``), an extension field (``[ext.full.name]``),
    and a regular named (or numbered) field.

    Args:
      tokenizer: A tokenizer to parse the field name and values.
      message: A protocol message to record the data.

    Raises:
      ParseError: In case of text parsing problems.
    """
    message_descriptor = message.DESCRIPTOR
    # Expanded Any syntax: the message is an Any and the next token opens a
    # bracketed type URL, e.g. [type.googleapis.com/pkg.Message] { ... }.
    if (message_descriptor.full_name == _ANY_FULL_TYPE_NAME and
        tokenizer.TryConsume('[')):
      type_url_prefix, packed_type_name = self._ConsumeAnyTypeUrl(tokenizer)
      # The ':' between the type URL and the message body is optional.
      tokenizer.TryConsume(':')
      self._DetectSilentMarker(
          tokenizer,
          message_descriptor.full_name,
          type_url_prefix + '/' + packed_type_name,
      )
      if tokenizer.TryConsume('<'):
        expanded_any_end_token = '>'
      else:
        tokenizer.Consume('{')
        expanded_any_end_token = '}'
      expanded_any_sub_message = _BuildMessageFromTypeName(packed_type_name,
                                                           self.descriptor_pool)
      # Direct comparison with None is used instead of implicit bool conversion
      # to avoid false positives with falsy initial values, e.g. for
      # google.protobuf.ListValue.
      if expanded_any_sub_message is None:
        raise ParseError('Type %s not found in descriptor pool' %
                         packed_type_name)
      while not tokenizer.TryConsume(expanded_any_end_token):
        if tokenizer.AtEnd():
          raise tokenizer.ParseErrorPreviousToken('Expected "%s".' %
                                                  (expanded_any_end_token,))
        self._MergeField(tokenizer, expanded_any_sub_message)
      deterministic = False

      message.Pack(
          expanded_any_sub_message,
          type_url_prefix=type_url_prefix + '/',
          deterministic=deterministic,
      )
      return

    if tokenizer.TryConsume('['):
      # Extension field: a dotted full name inside brackets.
      name = [tokenizer.ConsumeIdentifier()]
      while tokenizer.TryConsume('.'):
        name.append(tokenizer.ConsumeIdentifier())
      name = '.'.join(name)

      if not message_descriptor.is_extendable:
        raise tokenizer.ParseErrorPreviousToken(
            'Message type "%s" does not have extensions.' %
            message_descriptor.full_name)
      # pylint: disable=protected-access
      field = message.Extensions._FindExtensionByName(name)
      # pylint: enable=protected-access
      if not field:
        if self.allow_unknown_extension:
          field = None
        else:
          raise tokenizer.ParseErrorPreviousToken(
              'Extension "%s" not registered. '
              'Did you import the _pb2 module which defines it? '
              'If you are trying to place the extension in the MessageSet '
              'field of another message that is in an Any or MessageSet field, '
              'that message\'s _pb2 module must be imported as well' % name)
      elif message_descriptor != field.containing_type:
        raise tokenizer.ParseErrorPreviousToken(
            'Extension "%s" does not extend message type "%s".' %
            (name, message_descriptor.full_name))

      tokenizer.Consume(']')

    else:
      name = tokenizer.ConsumeIdentifierOrNumber()
      if self.allow_field_number and name.isdigit():
        # Field referenced by number instead of name.
        number = ParseInteger(name, True, True)
        field = message_descriptor.fields_by_number.get(number, None)
        if not field and message_descriptor.is_extendable:
          field = message.Extensions._FindExtensionByNumber(number)
      else:
        field = message_descriptor.fields_by_name.get(name, None)

        # Group names are expected to be capitalized as they appear in the
        # .proto file, which actually matches their type names, not their field
        # names.
        if not field:
          field = message_descriptor.fields_by_name.get(name.lower(), None)
          if field and not _IsGroupLike(field):
            field = None
          if field and field.message_type.name != name:
            field = None

      if not field and not self.allow_unknown_field:
        raise tokenizer.ParseErrorPreviousToken(
            'Message type "%s" has no field named "%s".' %
            (message_descriptor.full_name, name))

    if field:
      if not self._allow_multiple_scalars and field.containing_oneof:
        # Check if there's a different field set in this oneof.
        # Note that we ignore the case if the same field was set before, and we
        # apply _allow_multiple_scalars to non-scalar fields as well.
        which_oneof = message.WhichOneof(field.containing_oneof.name)
        if which_oneof is not None and which_oneof != field.name:
          raise tokenizer.ParseErrorPreviousToken(
              'Field "%s" is specified along with field "%s", another member '
              'of oneof "%s" for message type "%s".' %
              (field.name, which_oneof, field.containing_oneof.name,
               message_descriptor.full_name))

      # ':' is optional before a sub-message body but required before a scalar.
      if field.cpp_type == descriptor.FieldDescriptor.CPPTYPE_MESSAGE:
        tokenizer.TryConsume(':')
        self._DetectSilentMarker(tokenizer, message_descriptor.full_name,
                                 field.full_name)
        merger = self._MergeMessageField
      else:
        tokenizer.Consume(':')
        self._DetectSilentMarker(tokenizer, message_descriptor.full_name,
                                 field.full_name)
        merger = self._MergeScalarField

      if (field.is_repeated and
          tokenizer.TryConsume('[')):
        # Short repeated format, e.g. "foo: [1, 2, 3]"
        if not tokenizer.TryConsume(']'):
          while True:
            merger(tokenizer, message, field)
            if tokenizer.TryConsume(']'):
              break
            tokenizer.Consume(',')

      else:
        merger(tokenizer, message, field)

    else:  # Proto field is unknown.
      assert (self.allow_unknown_extension or self.allow_unknown_field)
      self._SkipFieldContents(tokenizer, name, message_descriptor.full_name)

    # For historical reasons, fields may optionally be separated by commas or
    # semicolons.
    if not tokenizer.TryConsume(','):
      tokenizer.TryConsume(';')

  def _LogSilentMarker(self, immediate_message_type, field_name):
    """Hook invoked when a debug-string silent marker is detected.

    No-op here; presumably overridden elsewhere to emit a log — confirm
    against subclasses.

    Args:
      immediate_message_type: Full name of the message containing the marker.
      field_name: Name of the field before which the marker appeared.
    """
    pass

993 

994 def _DetectSilentMarker(self, tokenizer, immediate_message_type, field_name): 

995 if tokenizer.contains_silent_marker_before_current_token: 

996 self._LogSilentMarker(immediate_message_type, field_name) 

997 

  def _ConsumeAnyTypeUrl(self, tokenizer):
    """Consumes a google.protobuf.Any type URL.

    Assumes the caller has already consumed the opening [ and consumes up to the
    closing ].

    Args:
      tokenizer: A tokenizer to parse the type URL.

    Returns:
      A tuple of type URL prefix (without trailing slash) and type name.

    Raises:
      ParseError: If the bracketed text is not a valid type URL.
    """
    # Consume all tokens with valid URL characters until ]. Whitespace and
    # comments are ignored/skipped by the Tokenizer.
    tokens = []
    last_slash = -1  # Index in `tokens` of the last '/' separator seen.
    while True:
      try:
        tokens.append(tokenizer.ConsumeUrlChars())
        continue
      except ParseError:
        # Not a URL-character token: must be '/' or the closing ']'.
        pass
      if tokenizer.TryConsume('/'):
        last_slash = len(tokens)
        tokens.append('/')
      else:
        tokenizer.Consume(']')
        break

    if last_slash == -1:
      raise tokenizer.ParseError('Type URL does not contain "/".')

    # Everything before the final '/' is the prefix; everything after is the
    # packed message's type name.
    prefix = ''.join(tokens[:last_slash])
    name = ''.join(tokens[last_slash + 1 :])

    if not prefix:
      raise tokenizer.ParseError('Type URL prefix is empty.')
    if prefix.startswith('/'):
      raise tokenizer.ParseError('Type URL prefix starts with "/".')

    # Check for invalid percent encodings. '%' needs to be followed by exactly
    # two valid hexadecimal digits.
    for i, char in enumerate(prefix):
      if char == '%' and not _PERCENT_ENCODING.match(prefix[i : i + 3]):
        raise tokenizer.ParseError(
            f'Invalid percent escape, got "{prefix[i : i + 3]}".'
        )

    # After the last slash we expect a valid type name, not just any sequence of
    # URL characters.
    if not _TYPE_NAME.match(name):
      raise tokenizer.ParseError('Expected type name, got "%s".' % name)

    return prefix, name

1052 

  def _MergeMessageField(self, tokenizer, message, field):
    """Merges a single message (or map entry) field into a message.

    Args:
      tokenizer: A tokenizer to parse the field value.
      message: The message of which field is a member.
      field: The descriptor of the field to be merged.

    Raises:
      ParseError: In case of text parsing problems.
    """
    is_map_entry = _IsMapEntry(field)

    # A sub-message body is delimited either by <...> or {...}.
    if tokenizer.TryConsume('<'):
      end_token = '>'
    else:
      tokenizer.Consume('{')
      end_token = '}'

    if field.is_repeated:
      if field.is_extension:
        sub_message = message.Extensions[field].add()
      elif is_map_entry:
        # Map entries are parsed into a standalone entry message first, then
        # copied into the map below.
        sub_message = getattr(message, field.name).GetEntryClass()()
      else:
        sub_message = getattr(message, field.name).add()
    else:
      if field.is_extension:
        if (not self._allow_multiple_scalars and
            message.HasExtension(field)):
          raise tokenizer.ParseErrorPreviousToken(
              'Message type "%s" should not have multiple "%s" extensions.' %
              (message.DESCRIPTOR.full_name, field.full_name))
        sub_message = message.Extensions[field]
      else:
        # Also apply _allow_multiple_scalars to message field.
        # TODO: Change to _allow_singular_overwrites.
        if (not self._allow_multiple_scalars and
            message.HasField(field.name)):
          raise tokenizer.ParseErrorPreviousToken(
              'Message type "%s" should not have multiple "%s" fields.' %
              (message.DESCRIPTOR.full_name, field.name))
        sub_message = getattr(message, field.name)
      # Mark presence even if the body turns out to be empty.
      sub_message.SetInParent()

    while not tokenizer.TryConsume(end_token):
      if tokenizer.AtEnd():
        raise tokenizer.ParseErrorPreviousToken('Expected "%s".' % (end_token,))
      self._MergeField(tokenizer, sub_message)

    if is_map_entry:
      # Transfer the parsed entry into the actual map field.
      value_cpptype = field.message_type.fields_by_name['value'].cpp_type
      if value_cpptype == descriptor.FieldDescriptor.CPPTYPE_MESSAGE:
        value = getattr(message, field.name)[sub_message.key]
        value.CopyFrom(sub_message.value)
      else:
        getattr(message, field.name)[sub_message.key] = sub_message.value

1110 

  def _MergeScalarField(self, tokenizer, message, field):
    """Merges a single scalar field into a message.

    Args:
      tokenizer: A tokenizer to parse the field value.
      message: A protocol message to record the data.
      field: The descriptor of the field to be merged.

    Raises:
      ParseError: In case of text parsing problems.
      RuntimeError: On runtime errors.
    """
    _ = self.allow_unknown_extension
    value = None

    # Dispatch on the wire type to pick the right consumer.
    if field.type in (descriptor.FieldDescriptor.TYPE_INT32,
                      descriptor.FieldDescriptor.TYPE_SINT32,
                      descriptor.FieldDescriptor.TYPE_SFIXED32):
      value = _ConsumeInt32(tokenizer)
    elif field.type in (descriptor.FieldDescriptor.TYPE_INT64,
                        descriptor.FieldDescriptor.TYPE_SINT64,
                        descriptor.FieldDescriptor.TYPE_SFIXED64):
      value = _ConsumeInt64(tokenizer)
    elif field.type in (descriptor.FieldDescriptor.TYPE_UINT32,
                        descriptor.FieldDescriptor.TYPE_FIXED32):
      value = _ConsumeUint32(tokenizer)
    elif field.type in (descriptor.FieldDescriptor.TYPE_UINT64,
                        descriptor.FieldDescriptor.TYPE_FIXED64):
      value = _ConsumeUint64(tokenizer)
    elif field.type in (descriptor.FieldDescriptor.TYPE_FLOAT,
                        descriptor.FieldDescriptor.TYPE_DOUBLE):
      value = tokenizer.ConsumeFloat()
    elif field.type == descriptor.FieldDescriptor.TYPE_BOOL:
      value = tokenizer.ConsumeBool()
    elif field.type == descriptor.FieldDescriptor.TYPE_STRING:
      value = tokenizer.ConsumeString()
    elif field.type == descriptor.FieldDescriptor.TYPE_BYTES:
      value = tokenizer.ConsumeByteString()
    elif field.type == descriptor.FieldDescriptor.TYPE_ENUM:
      value = tokenizer.ConsumeEnum(field)
    else:
      raise RuntimeError('Unknown field type %d' % field.type)

    if field.is_repeated:
      if field.is_extension:
        message.Extensions[field].append(value)
      else:
        getattr(message, field.name).append(value)
    else:
      if field.is_extension:
        if (not self._allow_multiple_scalars and
            field.has_presence and
            message.HasExtension(field)):
          raise tokenizer.ParseErrorPreviousToken(
              'Message type "%s" should not have multiple "%s" extensions.' %
              (message.DESCRIPTOR.full_name, field.full_name))
        else:
          message.Extensions[field] = value
      else:
        duplicate_error = False
        if not self._allow_multiple_scalars:
          if field.has_presence:
            duplicate_error = message.HasField(field.name)
          else:
            # For field that doesn't represent presence, try best effort to
            # check multiple scalars by compare to default values.
            duplicate_error = not decoder.IsDefaultScalarValue(
                getattr(message, field.name)
            )

        if duplicate_error:
          raise tokenizer.ParseErrorPreviousToken(
              'Message type "%s" should not have multiple "%s" fields.' %
              (message.DESCRIPTOR.full_name, field.name))
        else:
          setattr(message, field.name, value)

1187 

  def _SkipFieldContents(self, tokenizer, field_name, immediate_message_type):
    """Skips over contents (value or message) of a field.

    Args:
      tokenizer: A tokenizer to parse the field name and values.
      field_name: The field name currently being parsed.
      immediate_message_type: The type of the message immediately containing
        the silent marker.
    """
    # Try to guess the type of this field.
    # If this field is not a message, there should be a ":" between the
    # field name and the field value and also the field value should not
    # start with "{" or "<" which indicates the beginning of a message body.
    # If there is no ":" or there is a "{" or "<" after ":", this field has
    # to be a message or the input is ill-formed.
    if tokenizer.TryConsume(
        ':') and not tokenizer.LookingAt('{') and not tokenizer.LookingAt('<'):
      self._DetectSilentMarker(tokenizer, immediate_message_type, field_name)
      if tokenizer.LookingAt('['):
        self._SkipRepeatedFieldValue(tokenizer, immediate_message_type)
      else:
        self._SkipFieldValue(tokenizer)
    else:
      self._DetectSilentMarker(tokenizer, immediate_message_type, field_name)
      self._SkipFieldMessage(tokenizer, immediate_message_type)

1213 

1214 def _SkipField(self, tokenizer, immediate_message_type): 

1215 """Skips over a complete field (name and value/message). 

1216 

1217 Args: 

1218 tokenizer: A tokenizer to parse the field name and values. 

1219 immediate_message_type: The type of the message immediately containing 

1220 the silent marker. 

1221 """ 

1222 field_name = '' 

1223 if tokenizer.TryConsume('['): 

1224 # Consume extension or google.protobuf.Any type URL 

1225 field_name += '[' + tokenizer.ConsumeIdentifier() 

1226 num_identifiers = 1 

1227 while tokenizer.TryConsume('.'): 

1228 field_name += '.' + tokenizer.ConsumeIdentifier() 

1229 num_identifiers += 1 

1230 # This is possibly a type URL for an Any message. 

1231 if num_identifiers == 3 and tokenizer.TryConsume('/'): 

1232 field_name += '/' + tokenizer.ConsumeIdentifier() 

1233 while tokenizer.TryConsume('.'): 

1234 field_name += '.' + tokenizer.ConsumeIdentifier() 

1235 tokenizer.Consume(']') 

1236 field_name += ']' 

1237 else: 

1238 field_name += tokenizer.ConsumeIdentifierOrNumber() 

1239 

1240 self._SkipFieldContents(tokenizer, field_name, immediate_message_type) 

1241 

1242 # For historical reasons, fields may optionally be separated by commas or 

1243 # semicolons. 

1244 if not tokenizer.TryConsume(','): 

1245 tokenizer.TryConsume(';') 

1246 

1247 def _SkipFieldMessage(self, tokenizer, immediate_message_type): 

1248 """Skips over a field message. 

1249 

1250 Args: 

1251 tokenizer: A tokenizer to parse the field name and values. 

1252 immediate_message_type: The type of the message immediately containing 

1253 the silent marker 

1254 """ 

1255 if tokenizer.TryConsume('<'): 

1256 delimiter = '>' 

1257 else: 

1258 tokenizer.Consume('{') 

1259 delimiter = '}' 

1260 

1261 while not tokenizer.LookingAt('>') and not tokenizer.LookingAt('}'): 

1262 self._SkipField(tokenizer, immediate_message_type) 

1263 

1264 tokenizer.Consume(delimiter) 

1265 

1266 def _SkipFieldValue(self, tokenizer): 

1267 """Skips over a field value. 

1268 

1269 Args: 

1270 tokenizer: A tokenizer to parse the field name and values. 

1271 

1272 Raises: 

1273 ParseError: In case an invalid field value is found. 

1274 """ 

1275 if (not tokenizer.TryConsumeByteString()and 

1276 not tokenizer.TryConsumeIdentifier() and 

1277 not _TryConsumeInt64(tokenizer) and 

1278 not _TryConsumeUint64(tokenizer) and 

1279 not tokenizer.TryConsumeFloat()): 

1280 raise ParseError('Invalid field value: ' + tokenizer.token) 

1281 

  def _SkipRepeatedFieldValue(self, tokenizer, immediate_message_type):
    """Skips over a short-form repeated field value, e.g. "[1, 2, 3]".

    Args:
      tokenizer: A tokenizer to parse the field value.
      immediate_message_type: The type of the message immediately containing
        the silent marker.
    """
    tokenizer.Consume('[')
    if not tokenizer.TryConsume(']'):
      while True:
        # Each element is either a nested message body or a scalar value.
        if tokenizer.LookingAt('<') or tokenizer.LookingAt('{'):
          self._SkipFieldMessage(tokenizer, immediate_message_type)
        else:
          self._SkipFieldValue(tokenizer)
        if tokenizer.TryConsume(']'):
          break
        tokenizer.Consume(',')

1298 

1299 

class Tokenizer(object):
  """Protocol buffer text representation tokenizer.

  This class handles the lower level string parsing by splitting it into
  meaningful tokens.

  It was directly ported from the Java protocol buffer API.
  """

  _WHITESPACE = re.compile(r'\s+')
  _COMMENT = re.compile(r'(\s*#.*$)', re.MULTILINE)
  _WHITESPACE_OR_COMMENT = re.compile(r'(\s|(#.*$))+', re.MULTILINE)
  # A token is an identifier, a number, or a quoted string (one alternative
  # per quote mark).
  _TOKEN = re.compile(
      '|'.join(
          [
              r'[a-zA-Z_][0-9a-zA-Z_+-]*',  # an identifier
              r'([0-9+-]|(\.[0-9]))[0-9a-zA-Z_.+-]*',  # a number
          ]
          + [  # quoted str for each quote mark
              # Avoid backtracking! https://stackoverflow.com/a/844267
              r'{qt}[^{qt}\n\\]*((\\.)+[^{qt}\n\\]*)*({qt}|\\?$)'.format(
                  qt=mark
              )
              for mark in _QUOTES
          ]
      )
  )

  _IDENTIFIER = re.compile(r'[^\d\W]\w*')
  _IDENTIFIER_OR_NUMBER = re.compile(r'\w+')
  # Accepted URL characters (excluding "/")
  _URL_CHARS = re.compile(r'^[0-9a-zA-Z-.~_ !$&()*+,;=%]+$')

  def __init__(self, lines, skip_comments=True):
    """Initializes the tokenizer and reads the first token.

    Args:
      lines: An iterable of text lines to tokenize.
      skip_comments: If True (default), comments are skipped as whitespace;
        otherwise they are returned as tokens.
    """
    self._position = 0
    self._line = -1  # 0-based line index; -1 until the first line is popped.
    self._column = 0
    self._token_start = None
    self.token = ''
    self._lines = iter(lines)
    self._current_line = ''
    self._previous_line = 0
    self._previous_column = 0
    self._more_lines = True
    self._skip_comments = skip_comments
    self._whitespace_pattern = (skip_comments and self._WHITESPACE_OR_COMMENT
                                or self._WHITESPACE)
    self.contains_silent_marker_before_current_token = False

    self._SkipWhitespace()
    self.NextToken()

  def LookingAt(self, token):
    """Returns True iff the current token equals the given text."""
    return self.token == token

  def AtEnd(self):
    """Checks the end of the text was reached.

    Returns:
      True iff the end was reached.
    """
    return not self.token

  def _PopLine(self):
    # Advances to the next input line whenever the current one is exhausted.
    while len(self._current_line) <= self._column:
      try:
        self._current_line = next(self._lines)
      except StopIteration:
        self._current_line = ''
        self._more_lines = False
        return
      else:
        self._line += 1
        self._column = 0

  def _SkipWhitespace(self):
    # Skips whitespace (and comments, when enabled), recording whether the
    # skipped run was exactly the debug-string silent marker.
    while True:
      self._PopLine()
      match = self._whitespace_pattern.match(self._current_line, self._column)
      if not match:
        break
      self.contains_silent_marker_before_current_token = match.group(0) == (
          ' ' + _DEBUG_STRING_SILENT_MARKER)
      length = len(match.group(0))
      self._column += length

  def TryConsume(self, token):
    """Tries to consume a given piece of text.

    Args:
      token: Text to consume.

    Returns:
      True iff the text was consumed.
    """
    if self.token == token:
      self.NextToken()
      return True
    return False

  def Consume(self, token):
    """Consumes a piece of text.

    Args:
      token: Text to consume.

    Raises:
      ParseError: If the text couldn't be consumed.
    """
    if not self.TryConsume(token):
      raise self.ParseError('Expected "%s".' % token)

  def ConsumeComment(self):
    """Consumes a comment token and returns its text.

    Raises:
      ParseError: If the current token is not a comment.
    """
    result = self.token
    if not self._COMMENT.match(result):
      raise self.ParseError('Expected comment.')
    self.NextToken()
    return result

  def ConsumeCommentOrTrailingComment(self):
    """Consumes a comment, returns a 2-tuple (trailing bool, comment str)."""

    # Tokenizer initializes _previous_line and _previous_column to 0. As the
    # tokenizer starts, it looks like there is a previous token on the line.
    just_started = self._line == 0 and self._column == 0

    before_parsing = self._previous_line
    comment = self.ConsumeComment()

    # A trailing comment is a comment on the same line than the previous token.
    trailing = (self._previous_line == before_parsing
                and not just_started)

    return trailing, comment

  def TryConsumeIdentifier(self):
    """Returns True iff an identifier token was consumed."""
    try:
      self.ConsumeIdentifier()
      return True
    except ParseError:
      return False

  def ConsumeIdentifier(self):
    """Consumes protocol message field identifier.

    Returns:
      Identifier string.

    Raises:
      ParseError: If an identifier couldn't be consumed.
    """
    result = self.token
    if not self._IDENTIFIER.match(result):
      raise self.ParseError('Expected identifier.')
    self.NextToken()
    return result

  def TryConsumeIdentifierOrNumber(self):
    """Returns True iff an identifier or number token was consumed."""
    try:
      self.ConsumeIdentifierOrNumber()
      return True
    except ParseError:
      return False

  def ConsumeIdentifierOrNumber(self):
    """Consumes protocol message field identifier or number.

    Returns:
      Identifier string.

    Raises:
      ParseError: If an identifier couldn't be consumed.
    """
    result = self.token
    if not self._IDENTIFIER_OR_NUMBER.match(result):
      raise self.ParseError('Expected identifier or number, got %s.' % result)
    self.NextToken()
    return result

  def TryConsumeInteger(self):
    """Returns True iff an integer token was consumed."""
    try:
      self.ConsumeInteger()
      return True
    except ParseError:
      return False

  def ConsumeInteger(self):
    """Consumes an integer number.

    Returns:
      The integer parsed.

    Raises:
      ParseError: If an integer couldn't be consumed.
    """
    try:
      result = _ParseAbstractInteger(self.token)
    except ValueError as e:
      raise self.ParseError(str(e))
    self.NextToken()
    return result

  def TryConsumeFloat(self):
    """Returns True iff a floating point token was consumed."""
    try:
      self.ConsumeFloat()
      return True
    except ParseError:
      return False

  def ConsumeFloat(self):
    """Consumes a floating point number.

    Returns:
      The number parsed.

    Raises:
      ParseError: If a floating point number couldn't be consumed.
    """
    try:
      result = ParseFloat(self.token)
    except ValueError as e:
      raise self.ParseError(str(e))
    self.NextToken()
    return result

  def ConsumeBool(self):
    """Consumes a boolean value.

    Returns:
      The bool parsed.

    Raises:
      ParseError: If a boolean value couldn't be consumed.
    """
    try:
      result = ParseBool(self.token)
    except ValueError as e:
      raise self.ParseError(str(e))
    self.NextToken()
    return result

  def TryConsumeByteString(self):
    """Returns True iff a byte string token was consumed."""
    try:
      self.ConsumeByteString()
      return True
    except ParseError:
      return False

  def ConsumeString(self):
    """Consumes a string value.

    Returns:
      The string parsed.

    Raises:
      ParseError: If a string value couldn't be consumed.
    """
    the_bytes = self.ConsumeByteString()
    try:
      return str(the_bytes, 'utf-8')
    except UnicodeDecodeError as e:
      raise self._StringParseError(e)

  def ConsumeByteString(self):
    """Consumes a byte array value.

    Returns:
      The array parsed (as a string).

    Raises:
      ParseError: If a byte array value couldn't be consumed.
    """
    # Adjacent string literals are concatenated, like in C or Python.
    the_list = [self._ConsumeSingleByteString()]
    while self.token and self.token[0] in _QUOTES:
      the_list.append(self._ConsumeSingleByteString())
    return b''.join(the_list)

  def _ConsumeSingleByteString(self):
    """Consume one token of a string literal.

    String literals (whether bytes or text) can come in multiple adjacent
    tokens which are automatically concatenated, like in C or Python. This
    method only consumes one token.

    Returns:
      The token parsed.
    Raises:
      ParseError: When the wrong format data is found.
    """
    text = self.token
    if len(text) < 1 or text[0] not in _QUOTES:
      raise self.ParseError('Expected string but found: %r' % (text,))

    if len(text) < 2 or text[-1] != text[0]:
      raise self.ParseError('String missing ending quote: %r' % (text,))

    try:
      result = text_encoding.CUnescape(text[1:-1])
    except ValueError as e:
      raise self.ParseError(str(e))
    self.NextToken()
    return result

  def ConsumeEnum(self, field):
    """Consumes an enum value (name or number) for the given field.

    Raises:
      ParseError: If the token is not a valid value for the enum.
    """
    try:
      result = ParseEnum(field, self.token)
    except ValueError as e:
      raise self.ParseError(str(e))
    self.NextToken()
    return result

  def ConsumeUrlChars(self):
    """Consumes a token containing valid URL characters.

    Excludes '/' so that it can be treated specially as a delimiter.

    Returns:
      The next token containing one or more URL characters.

    Raises:
      ParseError: If the next token contains unaccepted URL characters.
    """
    if not self._URL_CHARS.match(self.token):
      raise self.ParseError('Expected URL character(s), got "%s"' % self.token)

    result = self.token
    self.NextToken()
    return result

  def TryConsumeUrlChars(self):
    """Returns True iff a URL-characters token was consumed."""
    try:
      self.ConsumeUrlChars()
      return True
    except ParseError:
      return False

  def ParseErrorPreviousToken(self, message):
    """Creates and *returns* a ParseError for the previously read token.

    Args:
      message: A message to set for the exception.

    Returns:
      A ParseError instance.
    """
    return ParseError(message, self._previous_line + 1,
                      self._previous_column + 1)

  def ParseError(self, message):
    """Creates and *returns* a ParseError for the current token."""
    return ParseError('\'' + self._current_line + '\': ' + message,
                      self._line + 1, self._column + 1)

  def _StringParseError(self, e):
    # Wraps a UnicodeDecodeError in a positioned ParseError.
    return self.ParseError('Couldn\'t parse string: ' + str(e))

  def NextToken(self):
    """Reads the next meaningful token."""
    self._previous_line = self._line
    self._previous_column = self._column
    self.contains_silent_marker_before_current_token = False

    self._column += len(self.token)
    self._SkipWhitespace()

    if not self._more_lines:
      self.token = ''
      return

    match = self._TOKEN.match(self._current_line, self._column)
    if not match and not self._skip_comments:
      match = self._COMMENT.match(self._current_line, self._column)
    if match:
      token = match.group(0)
      self.token = token
    else:
      # Fall back to a single character (e.g. punctuation like '{' or ':').
      self.token = self._current_line[self._column]

# Aliased so it can still be accessed by current visibility violators.
# TODO: Migrate violators to textformat_tokenizer.
_Tokenizer = Tokenizer  # pylint: disable=invalid-name

1681 

1682 

def _ConsumeInt32(tokenizer):
  """Consumes a signed 32bit integer number from tokenizer.

  Args:
    tokenizer: A tokenizer used to parse the number.

  Returns:
    The integer parsed.

  Raises:
    ParseError: If a signed 32bit integer couldn't be consumed.
  """
  return _ConsumeInteger(tokenizer, is_long=False, is_signed=True)

1696 

1697 

def _ConsumeUint32(tokenizer):
  """Consumes an unsigned 32bit integer number from tokenizer.

  Args:
    tokenizer: A tokenizer used to parse the number.

  Returns:
    The integer parsed.

  Raises:
    ParseError: If an unsigned 32bit integer couldn't be consumed.
  """
  return _ConsumeInteger(tokenizer, is_long=False, is_signed=False)

1711 

1712 

def _TryConsumeInt64(tokenizer):
  """Returns True iff a signed 64bit integer was consumed from tokenizer."""
  try:
    _ConsumeInt64(tokenizer)
  except ParseError:
    return False
  return True

1719 

1720 

def _ConsumeInt64(tokenizer):
  """Consumes a signed 64bit integer number from tokenizer.

  Args:
    tokenizer: A tokenizer used to parse the number.

  Returns:
    The integer parsed.

  Raises:
    ParseError: If a signed 64bit integer couldn't be consumed.
  """
  return _ConsumeInteger(tokenizer, is_signed=True, is_long=True)

1734 

1735 

def _TryConsumeUint64(tokenizer):
  """Returns True iff an unsigned 64bit integer was consumed from tokenizer."""
  try:
    _ConsumeUint64(tokenizer)
  except ParseError:
    return False
  return True

1742 

1743 

def _ConsumeUint64(tokenizer):
  """Consumes an unsigned 64bit integer number from tokenizer.

  Args:
    tokenizer: A tokenizer used to parse the number.

  Returns:
    The integer parsed.

  Raises:
    ParseError: If an unsigned 64bit integer couldn't be consumed.
  """
  return _ConsumeInteger(tokenizer, is_long=True, is_signed=False)

1757 

1758 

def _ConsumeInteger(tokenizer, is_signed=False, is_long=False):
  """Consumes an integer number from tokenizer.

  Args:
    tokenizer: A tokenizer used to parse the number.
    is_signed: True if a signed integer must be parsed.
    is_long: True if a long integer must be parsed.

  Returns:
    The integer parsed.

  Raises:
    ParseError: If an integer with given characteristics couldn't be consumed.
  """
  try:
    value = ParseInteger(tokenizer.token, is_signed=is_signed, is_long=is_long)
  except ValueError as e:
    # Re-raise with position information from the tokenizer.
    raise tokenizer.ParseError(str(e))
  tokenizer.NextToken()
  return value

1779 

1780 

def ParseInteger(text, is_signed=False, is_long=False):
  """Parses an integer.

  Args:
    text: The text to parse.
    is_signed: True if a signed integer must be parsed.
    is_long: True if a long integer must be parsed.

  Returns:
    The integer value.

  Raises:
    ValueError: Thrown Iff the text is not a valid integer.
  """
  # Do the actual parsing. Exception handling is propagated to caller.
  result = _ParseAbstractInteger(text)

  # Check if the integer is sane. Exceptions handled by callers.
  # Checker index encodes (is_long, is_signed): uint32, int32, uint64, int64.
  checker = _INTEGER_CHECKERS[2 * int(is_long) + int(is_signed)]
  checker.CheckValue(result)
  return result

1802 

1803 

1804def _ParseAbstractInteger(text): 

1805 """Parses an integer without checking size/signedness. 

1806 

1807 Args: 

1808 text: The text to parse. 

1809 

1810 Returns: 

1811 The integer value. 

1812 

1813 Raises: 

1814 ValueError: Thrown Iff the text is not a valid integer. 

1815 """ 

1816 # Do the actual parsing. Exception handling is propagated to caller. 

1817 orig_text = text 

1818 c_octal_match = re.match(r'(-?)0(\d+)$', text) 

1819 if c_octal_match: 

1820 # Python 3 no longer supports 0755 octal syntax without the 'o', so 

1821 # we always use the '0o' prefix for multi-digit numbers starting with 0. 

1822 text = c_octal_match.group(1) + '0o' + c_octal_match.group(2) 

1823 try: 

1824 return int(text, 0) 

1825 except ValueError: 

1826 raise ValueError('Couldn\'t parse integer: %s' % orig_text) 

1827 

1828 

def ParseFloat(text):
  """Parse a floating point number.

  Args:
    text: Text to parse.

  Returns:
    The number parsed.

  Raises:
    ValueError: If a floating point number couldn't be parsed.
  """
  # Reject C-style octal-looking floats up front (pattern defined at module
  # level).
  if _FLOAT_OCTAL_PREFIX.match(text):
    raise ValueError('Invalid octal float: %s' % text)
  try:
    # Assume Python compatible syntax.
    return float(text)
  except ValueError:
    # Check alternative spellings.
    if _FLOAT_INFINITY.match(text):
      if text[0] == '-':
        return float('-inf')
      else:
        return float('inf')
    elif _FLOAT_NAN.match(text):
      return float('nan')
    else:
      # assume '1.0f' format
      try:
        return float(text.rstrip('fF'))
      except ValueError:
        raise ValueError("Couldn't parse float: %s" % text)

1861 

1862 

def ParseBool(text):
  """Parse a boolean value.

  Args:
    text: Text to parse.

  Returns:
    Boolean values parsed

  Raises:
    ValueError: If text is not a valid boolean.
  """
  # Accept the same spellings the text format has always accepted.
  if text in ('true', 't', '1', 'True'):
    return True
  if text in ('false', 'f', '0', 'False'):
    return False
  raise ValueError('Expected "true" or "false".')

1881 

1882 

def ParseEnum(field, value):
  """Parse an enum value.

  The value can be specified by a number (the enum value), or by
  a string literal (the enum name).

  Args:
    field: Enum field descriptor.
    value: String value.

  Returns:
    Enum value number.

  Raises:
    ValueError: If the enum value could not be parsed.
  """
  enum_descriptor = field.enum_type
  try:
    # Base 0 accepts decimal, hex (0x) and octal (0o) spellings.
    number = int(value, 0)
  except ValueError:
    # Identifier.
    enum_value = enum_descriptor.values_by_name.get(value, None)
    if enum_value is None:
      raise ValueError('Enum type "%s" has no value named %s.' %
                       (enum_descriptor.full_name, value))
  else:
    # Open enums accept any number, even ones without a declared name.
    if not field.enum_type.is_closed:
      return number
    enum_value = enum_descriptor.values_by_number.get(number, None)
    if enum_value is None:
      raise ValueError('Enum type "%s" has no value with number %d.' %
                       (enum_descriptor.full_name, number))
  return enum_value.number