Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/nameparser/parser.py: 79%

477 statements  

« prev     ^ index     » next       coverage.py v7.2.7, created at 2023-06-07 06:08 +0000

1# -*- coding: utf-8 -*- 

2from __future__ import unicode_literals 

3 

4import sys 

5import re 

6from operator import itemgetter 

7from itertools import groupby 

8 

9from nameparser.util import u 

10from nameparser.util import text_types, binary_type 

11from nameparser.util import lc 

12from nameparser.util import log 

13from nameparser.config import CONSTANTS 

14from nameparser.config import Constants 

15from nameparser.config import DEFAULT_ENCODING 

16 

17ENCODING = 'utf-8' 

18 

19 

def group_contiguous_integers(data):
    """
    Find the runs of consecutive integers in ``data``.

    :param data: an iterable of integers
    :return: one ``(first, last)`` tuple per run of two or more consecutive
        values; isolated values are not reported
    :rtype: list
    """
    spans = []
    # Consecutive values keep a constant (position - value) offset, so
    # grouping on that offset isolates each run.
    runs = groupby(enumerate(data), lambda pair: pair[0] - pair[1])
    for _, members in runs:
        values = [value for _, value in members]
        if len(values) < 2:
            continue
        spans.append((values[0], values[-1]))
    return spans

31 

32 

33class HumanName(object): 

34 """ 

35 Parse a person's name into individual components. 

36 

37 Instantiation assigns to ``full_name``, and assignment to 

38 :py:attr:`full_name` triggers :py:func:`parse_full_name`. After parsing the 

39 name, these instance attributes are available. Alternatively, you can pass  

40 any of the instance attributes to the constructor method and skip the parsing 

41 process. If any of the instance attributes are passed to the constructor  

42 as keywords, :py:func:`parse_full_name` will not be performed.  

43 

44 **HumanName Instance Attributes** 

45 

46 * :py:attr:`title` 

47 * :py:attr:`first` 

48 * :py:attr:`middle` 

49 * :py:attr:`last` 

50 * :py:attr:`suffix` 

51 * :py:attr:`nickname` 

52 * :py:attr:`surnames` 

53 

54 :param str full_name: The name string to be parsed. 

55 :param constants constants: 

56 a :py:class:`~nameparser.config.Constants` instance. Pass ``None`` for 

57 `per-instance config <customize.html>`_. 

58 :param str encoding: string representing the encoding of your input 

59 :param str string_format: python string formatting 

60 :param str initials_format: python initials string formatting 

61 :param str initials_delimiter: string delimiter for initials 

62 :param str first: first name 

63 :param str middle: middle name 

64 :param str last: last name 

65 :param str title: The title or prenominal 

66 :param str suffix: The suffix or postnominal 

67 :param str nickname: Nicknames 

68 """ 

69 

70 C = CONSTANTS 

71 """ 

72 A reference to the configuration for this instance, which may or may not be 

73 a reference to the shared, module-wide instance at 

74 :py:mod:`~nameparser.config.CONSTANTS`. See `Customizing the Parser 

75 <customize.html>`_. 

76 """ 

77 

78 original = '' 

79 """ 

80 The original string, untouched by the parser. 

81 """ 

82 

83 _count = 0 

84 _members = ['title', 'first', 'middle', 'last', 'suffix', 'nickname'] 

85 unparsable = True 

86 _full_name = '' 

87 

def __init__(self, full_name="", constants=CONSTANTS, encoding=DEFAULT_ENCODING,
             string_format=None, initials_format=None, initials_delimiter=None,
             first=None, middle=None, last=None, title=None, suffix=None,
             nickname=None):
    """
    Set up the configuration and either parse ``full_name`` or take the
    name attributes as given.

    If any of ``first``, ``middle``, ``last``, ``title``, ``suffix`` or
    ``nickname`` is truthy, all six are assigned directly and
    ``full_name`` is not parsed. Otherwise ``full_name`` is assigned,
    which triggers the parse.
    """
    # Anything whose type is not exactly that of the shared CONSTANTS
    # object (e.g. None) gets a fresh, private Constants instance.
    same_type = type(constants) is type(CONSTANTS)
    self.C = constants if same_type else Constants()

    self.encoding = encoding
    # Per-instance formats fall back to the configuration defaults.
    self.string_format = string_format or self.C.string_format
    self.initials_format = initials_format or self.C.initials_format
    self.initials_delimiter = initials_delimiter or self.C.initials_delimiter

    explicit_parts = (first, middle, last, title, suffix, nickname)
    if not any(explicit_parts):
        # full_name setter triggers the parse
        self.full_name = full_name
        return

    self.first = first
    self.middle = middle
    self.last = last
    self.title = title
    self.suffix = suffix
    self.nickname = nickname
    self.unparsable = False

111 

def __iter__(self):
    """Return the instance itself; it is its own iterator (see ``__next__``)."""
    return self

114 

def __len__(self):
    """Return how many items iterating over the instance yields."""
    return sum(1 for _ in self)

120 

def __eq__(self, other):
    """
    Compare case-insensitively: two objects are equal when their
    lower-cased unicode representations match.
    """
    mine = u(self).lower()
    theirs = u(other).lower()
    return mine == theirs

127 

def __ne__(self, other):
    """Inverse of ``__eq__``: lower-cased unicode representations differ."""
    mine = u(self).lower()
    theirs = u(other).lower()
    return mine != theirs

130 

def __getitem__(self, key):
    """
    Look up name attributes by name or by slice.

    A slice is applied to ``_members`` and returns the list of the
    corresponding attribute values; any other key is treated as an
    attribute name.
    """
    if not isinstance(key, slice):
        return getattr(self, key)
    return [getattr(self, member) for member in self._members[key]]

136 

def __setitem__(self, key, value):
    """
    Assign a name attribute by key.

    :raises KeyError: if ``key`` is not one of ``_members``
    """
    if key not in self._members:
        raise KeyError("Not a valid HumanName attribute", key)
    self._set_list(key, value)

142 

def next(self):
    """Forward to ``__next__`` (``next`` is the Python 2 name of the iterator method)."""
    return self.__next__()

145 

def __next__(self):
    """
    Return the next non-empty attribute value, in ``_members`` order.

    The cursor lives in ``self._count``; it is reset to 0 when the
    members are exhausted so the instance can be iterated again.
    """
    position = self._count
    if position >= len(self._members):
        self._count = 0
        raise StopIteration
    self._count = position + 1
    value = getattr(self, self._members[position])
    if value:
        return value
    # Empty attribute: skip ahead to the following member.
    return next(self)

154 

def __unicode__(self):
    """
    Render the name with ``string_format``, or join the non-empty
    attributes with spaces when no format is set.
    """
    if not self.string_format:
        return " ".join(self)

    # string_format = "{title} {first} {middle} {last} {suffix} ({nickname})"
    rendered = self.string_format.format(**self.as_dict())
    # Strip the placeholder for empty attributes, then the punctuation
    # left behind by a missing nickname.
    leftovers = (str(self.C.empty_attribute_default), " ()", " ''", ' ""')
    for leftover in leftovers:
        rendered = rendered.replace(leftover, "")
    return self.collapse_whitespace(rendered).strip(', ')

163 

def __hash__(self):
    """
    Hash the lower-cased unicode representation.

    ``__eq__`` compares lower-cased representations, so the hash must be
    derived from the same value: objects that compare equal are required
    to have equal hashes, otherwise sets and dicts misbehave.
    """
    return hash(u(self).lower())

166 

def __str__(self):
    """
    Return the formatted name: text on Python 3, bytes encoded with
    ``self.encoding`` on Python 2.
    """
    text = self.__unicode__()
    if sys.version_info[0] < 3:
        return text.encode(self.encoding)
    return text

171 

def __repr__(self):
    """
    Return a multi-line debug representation listing every name
    attribute, or an "Unparsable" marker when ``unparsable`` is set.
    """
    class_name = self.__class__.__name__
    if self.unparsable:
        text = "<%(class)s : [ Unparsable ] >" % {'class': class_name, }
    else:
        values = {'class': class_name}
        for member in ('title', 'first', 'middle', 'last', 'suffix', 'nickname'):
            # Falsy values (e.g. None) are shown as empty strings.
            values[member] = getattr(self, member) or ''
        text = "<%(class)s : [\n\ttitle: %(title)r \n\tfirst: %(first)r \n\tmiddle: %(middle)r \n\tlast: %(last)r \n\tsuffix: %(suffix)r\n\tnickname: %(nickname)r\n]>" % values
    if sys.version_info[0] < 3:
        return text.encode(self.encoding)
    return text

188 

def as_dict(self, include_empty=True):
    """
    Return the name attributes listed in ``_members`` as a dictionary.

    :param bool include_empty: when False, attributes with a falsy value
        are left out of the result.
    :rtype: dict

    .. doctest::

        >>> name = HumanName("Bob Dole")
        >>> name.as_dict()
        {'last': 'Dole', 'suffix': '', 'title': '', 'middle': '', 'nickname': '', 'first': 'Bob'}
        >>> name.as_dict(False)
        {'last': 'Dole', 'first': 'Bob'}

    """
    pairs = ((member, getattr(self, member)) for member in self._members)
    return {key: value for key, value in pairs if include_empty or value}

214 

def __process_initial__(self, name_part, firstname=False):
    """
    Reduce a name part to the space-joined first letters of its words.

    Words that are prefixes or conjunctions are left out, unless
    ``firstname`` is set, since first names cannot be conjunctions or
    prefixes.

    :param str name_part: one name piece, possibly containing spaces
    :param bool firstname: keep every word, even prefixes/conjunctions
    :return: the initials separated by single spaces, or
        ``C.empty_attribute_default`` when no word qualifies
    """
    initials = [
        word[0]
        for word in name_part.split(" ")
        # Consecutive spaces produce empty chunks; indexing them with [0]
        # would raise IndexError, so they are skipped.
        if word and (
            firstname
            or not (self.is_prefix(word) or self.is_conjunction(word))
        )
    ]
    if initials:
        return " ".join(initials)
    return self.C.empty_attribute_default

230 

def initials_list(self):
    """
    Return the initials of the first, middle and last names as a list.

    Empty entries in the underlying lists are skipped.

    .. doctest::

        >>> name = HumanName("Sir Bob Andrew Dole")
        >>> name.initials_list()
        ['B', 'A', 'D']
        >>> name = HumanName("J. Doe")
        >>> name.initials_list()
        ['J', 'D']
    """
    collected = []
    # Only first names keep prefixes/conjunctions when reduced to initials.
    sources = (
        (self.first_list, True),
        (self.middle_list, False),
        (self.last_list, False),
    )
    for names, is_first in sources:
        for name in names:
            if name:
                collected.append(self.__process_initial__(name, is_first))
    return collected

248 

def initials(self):
    """
    Return the initials rendered through ``initials_format``.

    Each of the first, middle and last groups is joined with
    ``initials_delimiter`` followed by a space, and terminated with the
    delimiter; an empty group renders as ``C.empty_attribute_default``.

    :rtype: str

    .. doctest::

        >>> name = HumanName("Sir Bob Andrew Dole")
        >>> name.initials()
        'B. A. D.'
        >>> name = HumanName("Sir Bob Andrew Dole", initials_format="{first} {middle}")
        >>> name.initials()
        'B. A.'
    """
    first_letters = [self.__process_initial__(name, True) for name in self.first_list if name]
    middle_letters = [self.__process_initial__(name) for name in self.middle_list if name]
    last_letters = [self.__process_initial__(name) for name in self.last_list if name]

    def render(letters):
        # "B", "A" with delimiter "." -> "B. A."
        if not letters:
            return self.C.empty_attribute_default
        return (self.initials_delimiter + " ").join(letters) + self.initials_delimiter

    rendered = {
        "first": render(first_letters),
        "middle": render(middle_letters),
        "last": render(last_letters),
    }
    return self.collapse_whitespace(self.initials_format.format(**rendered))

281 

@property
def has_own_config(self):
    """
    True if this instance is not using the shared module-level
    configuration.
    """
    # Identity check (not equality) against the module-level CONSTANTS object.
    return self.C is not CONSTANTS

289 

290 # attributes 

291 

@property
def title(self):
    """
    The person's titles: the entries of ``title_list`` joined with
    spaces, or ``C.empty_attribute_default`` when there are none.
    Any string of consecutive pieces in
    :py:mod:`~nameparser.config.titles` or
    :py:mod:`~nameparser.config.conjunctions`
    at the beginning of :py:attr:`full_name`.
    """
    joined = " ".join(self.title_list)
    if joined:
        return joined
    return self.C.empty_attribute_default

301 

@property
def first(self):
    """
    The person's first name: the entries of ``first_list`` joined with
    spaces, or ``C.empty_attribute_default`` when there are none.
    The first name piece after any known :py:attr:`title` pieces parsed
    from :py:attr:`full_name`.
    """
    joined = " ".join(self.first_list)
    if joined:
        return joined
    return self.C.empty_attribute_default

309 

@property
def middle(self):
    """
    The person's middle names: the entries of ``middle_list`` joined
    with spaces, or ``C.empty_attribute_default`` when there are none.
    All name pieces after the first name and before the last name parsed
    from :py:attr:`full_name`.
    """
    joined = " ".join(self.middle_list)
    if joined:
        return joined
    return self.C.empty_attribute_default

317 

@property
def last(self):
    """
    The person's last name: the entries of ``last_list`` joined with
    spaces, or ``C.empty_attribute_default`` when there are none.
    The last name piece parsed from :py:attr:`full_name`.
    """
    joined = " ".join(self.last_list)
    if joined:
        return joined
    return self.C.empty_attribute_default

325 

@property
def suffix(self):
    """
    The person's suffixes: the entries of ``suffix_list`` joined with
    ``", "``, or ``C.empty_attribute_default`` when there are none.
    Pieces at the end of the name that are found in
    :py:mod:`~nameparser.config.suffixes`, or pieces that are at the end
    of comma separated formats, e.g.
    "Lastname, Title Firstname Middle[,] Suffix [, Suffix]" parsed
    from :py:attr:`full_name`.
    """
    joined = ", ".join(self.suffix_list)
    if joined:
        return joined
    return self.C.empty_attribute_default

336 

@property
def nickname(self):
    """
    The person's nicknames: the entries of ``nickname_list`` joined with
    spaces, or ``C.empty_attribute_default`` when there are none.
    Any text found inside of quotes (``""``) or parenthesis (``()``).
    """
    joined = " ".join(self.nickname_list)
    if joined:
        return joined
    return self.C.empty_attribute_default

344 

@property
def surnames_list(self):
    """
    List of middle names followed by last name.
    """
    # Concatenation builds a new list; middle_list and last_list are not modified.
    return self.middle_list + self.last_list

351 

@property
def surnames(self):
    """
    A string of all middle names followed by the last name, joined with
    spaces, or ``C.empty_attribute_default`` when there are none.
    """
    joined = " ".join(self.surnames_list)
    if joined:
        return joined
    return self.C.empty_attribute_default

358 

359 # setter methods 

360 

def _set_list(self, attr, value):
    """
    Normalise ``value`` to a list of pieces and store it on
    ``<attr>_list`` after running it through :py:func:`parse_pieces`.

    :param str attr: attribute name without the ``_list`` suffix
    :param value: a string, a list or ``None`` (``None`` clears the list)
    :raises TypeError: for any other type
    """
    if value is None:
        pieces = []
    elif isinstance(value, list):
        pieces = value
    elif isinstance(value, text_types):
        pieces = [value]
    else:
        raise TypeError(
            "Can only assign strings, lists or None to name attributes."
            " Got {0}".format(type(value)))
    setattr(self, attr + "_list", self.parse_pieces(pieces))

373 

@title.setter
def title(self, value):
    # Delegates to _set_list, which validates the value and stores the parsed pieces in title_list.
    self._set_list('title', value)

377 

@first.setter
def first(self, value):
    # Delegates to _set_list, which validates the value and stores the parsed pieces in first_list.
    self._set_list('first', value)

381 

@middle.setter
def middle(self, value):
    # Delegates to _set_list, which validates the value and stores the parsed pieces in middle_list.
    self._set_list('middle', value)

385 

@last.setter
def last(self, value):
    # Delegates to _set_list, which validates the value and stores the parsed pieces in last_list.
    self._set_list('last', value)

389 

@suffix.setter
def suffix(self, value):
    # Delegates to _set_list, which validates the value and stores the parsed pieces in suffix_list.
    self._set_list('suffix', value)

393 

@nickname.setter
def nickname(self, value):
    # Delegates to _set_list, which validates the value and stores the parsed pieces in nickname_list.
    self._set_list('nickname', value)

397 

398 # Parse helpers 

399 

def is_title(self, value):
    """Is in the :py:data:`~nameparser.config.titles.TITLES` set."""
    normalized = lc(value)
    return normalized in self.C.titles

403 

def is_conjunction(self, piece):
    """
    Is in the conjunctions set and not :py:func:`is_an_initial()`.

    :param piece: a string, or a list of strings
    :return: for a list, True if any item is a conjunction, otherwise
        False (never ``None``)
    :rtype: bool
    """
    if isinstance(piece, list):
        return any(self.is_conjunction(item) for item in piece)
    return piece.lower() in self.C.conjunctions and not self.is_an_initial(piece)

412 

def is_prefix(self, piece):
    """
    Lowercase and no periods version of piece is in the
    :py:data:`~nameparser.config.prefixes.PREFIXES` set.

    :param piece: a string, or a list of strings
    :return: for a list, True if any item is a prefix, otherwise False
        (never ``None``)
    :rtype: bool
    """
    if isinstance(piece, list):
        return any(self.is_prefix(item) for item in piece)
    return lc(piece) in self.C.prefixes

424 

def is_roman_numeral(self, value):
    """
    Matches the ``roman_numeral`` regular expression in
    :py:data:`~nameparser.config.regexes.REGEXES`.

    :rtype: bool
    """
    found = self.C.regexes.roman_numeral.match(value)
    return True if found else False

431 

def is_suffix(self, piece):
    """
    Is in the suffixes set and not :py:func:`is_an_initial()`.

    Some suffixes may be acronyms (M.B.A) while some are not (Jr.),
    so we remove the periods from `piece` when testing against
    `C.suffix_acronyms`.

    :param piece: a string, or a list of strings
    :return: for a list, True if any item is a suffix, otherwise False
        (never ``None``)
    :rtype: bool
    """
    if isinstance(piece, list):
        return any(self.is_suffix(item) for item in piece)
    # suffixes may have periods inside them like "M.D."
    return ((lc(piece).replace('.', '') in self.C.suffix_acronyms)
            or (lc(piece) in self.C.suffix_not_acronyms)) \
        and not self.is_an_initial(piece)

449 

def are_suffixes(self, pieces):
    """Return True if all pieces are suffixes (True for an empty sequence)."""
    return all(self.is_suffix(piece) for piece in pieces)

456 

def is_rootname(self, piece):
    """
    Is not a known title, suffix or prefix, and is not an initial.
    Just first, middle, last names.
    """
    if lc(piece) in self.C.suffixes_prefixes_titles:
        return False
    return not self.is_an_initial(piece)

463 

def is_an_initial(self, value):
    """
    Words with a single period at the end, or a single uppercase letter.

    Matches the ``initial`` regular expression in
    :py:data:`~nameparser.config.regexes.REGEXES`.

    :rtype: bool
    """
    found = self.C.regexes.initial.match(value)
    return True if found else False

472 

473 # full_name parser 

474 

@property
def full_name(self):
    """The string output of the HumanName instance."""
    # Delegates to __str__, so the value is rebuilt from the current attributes on each access.
    return self.__str__()

479 

@full_name.setter
def full_name(self, value):
    # `original` keeps the value exactly as given; `_full_name` is the
    # working copy that the parser rewrites.
    self.original = value
    self._full_name = value
    if isinstance(value, binary_type):
        # Binary input is decoded with the instance's configured encoding.
        self._full_name = value.decode(self.encoding)
    # Every assignment triggers a fresh parse.
    self.parse_full_name()

487 

def collapse_whitespace(self, string):
    """
    Strip the string, replace every match of the ``spaces`` regex with
    a single space and drop one trailing comma, if present.
    """
    squeezed = self.C.regexes.spaces.sub(" ", string.strip())
    return squeezed[:-1] if squeezed.endswith(",") else squeezed

494 

def pre_process(self):
    """
    This method happens at the beginning of the :py:func:`parse_full_name`
    before any other processing of the string aside from unicode
    normalization, so it's a good place to do any custom handling in a
    subclass. Runs :py:func:`fix_phd`, :py:func:`parse_nicknames` and
    :py:func:`squash_emoji`, in that order.
    """
    steps = (self.fix_phd, self.parse_nicknames, self.squash_emoji)
    for step in steps:
        step()

507 

def post_process(self):
    """
    This happens at the end of the :py:func:`parse_full_name` after
    all other processing has taken place. Runs :py:func:`handle_firstnames`
    and :py:func:`handle_capitalization`, in that order.
    """
    steps = (self.handle_firstnames, self.handle_capitalization)
    for step in steps:
        step()

516 

def fix_phd(self):
    """
    Move a match of the ``phd`` regex out of ``_full_name``: its first
    group is appended to ``suffix_list`` and every match is removed
    from the string. An ``AttributeError`` raised along the way (for
    example when the regex is not available) is ignored.
    """
    try:
        phd_re = self.C.regexes.phd
        found = phd_re.search(self._full_name)
        if not found:
            return
        self.suffix_list.append(found.group(1))
        self._full_name = phd_re.sub('', self._full_name)
    except AttributeError:
        pass

526 

def parse_nicknames(self):
    """
    The content of parenthesis or quotes in the name will be added to the
    nicknames list. This happens before any other processing of the name.

    Single quotes cannot span white space characters and must border
    white space to allow for quotes in names like O'Connor and Kawai'ae'a.
    Double quotes and parenthesis can span white space.

    Loops through 3 :py:data:`~nameparser.config.regexes.REGEXES`;
    `quoted_word`, `double_quotes` and `parenthesis`. A regex that is
    unset (falsy) is skipped.
    """
    regexes = self.C.regexes
    patterns = (regexes.quoted_word, regexes.double_quotes, regexes.parenthesis)

    for pattern in patterns:
        if not pattern:
            # Substituting an empty pattern here would match at every
            # position and fill nickname_list with empty strings.
            continue
        if pattern.search(self._full_name):
            self.nickname_list += pattern.findall(self._full_name)
            self._full_name = pattern.sub('', self._full_name)

550 

def squash_emoji(self):
    """
    Remove everything matching the ``emoji`` regex from the input
    string. Does nothing when that regex is unset (falsy).
    """
    emoji_re = self.C.regexes.emoji
    if not emoji_re:
        return
    if emoji_re.search(self._full_name):
        self._full_name = emoji_re.sub('', self._full_name)

558 

def handle_firstnames(self):
    """
    If there are only two parts and one is a title, assume it's a last name
    instead of a first name. e.g. Mr. Johnson. Unless it's a special title
    like "Sir", then when it's followed by a single name that name is always
    a first name.
    """
    if not self.title:
        return
    if len(self) != 2:
        return
    if lc(self.title) in self.C.first_name_titles:
        return
    # Swap: the single name after the title becomes the last name.
    self.last, self.first = self.first, self.last

570 

def parse_full_name(self):
    """
    The main parse method for the parser. This method is run upon
    assignment to the :py:attr:`full_name` attribute or instantiation.

    Basic flow is to hand off to :py:func:`pre_process` to handle
    nicknames. It then splits on commas and chooses a code path depending
    on the number of commas.

    :py:func:`parse_pieces` then splits those parts on spaces and
    :py:func:`join_on_conjunctions` joins any pieces next to conjunctions.

    All six ``*_list`` attributes are reset before parsing. When no
    attribute ends up non-empty, ``unparsable`` stays True and the
    original string is logged; :py:func:`post_process` runs either way.
    """

    self.title_list = []
    self.first_list = []
    self.middle_list = []
    self.last_list = []
    self.suffix_list = []
    self.nickname_list = []
    self.unparsable = True

    self.pre_process()

    self._full_name = self.collapse_whitespace(self._full_name)

    # break up full_name by commas
    parts = [x.strip() for x in self._full_name.split(",")]

    log.debug("full_name: %s", self._full_name)
    log.debug("parts: %s", parts)

    if len(parts) == 1:

        # no commas, title first middle middle middle last suffix
        #            part[0]

        pieces = self.parse_pieces(parts)
        p_len = len(pieces)
        for i, piece in enumerate(pieces):
            try:
                nxt = pieces[i + 1]
            except IndexError:
                nxt = None

            # title must have a next piece, unless it's just a title
            if not self.first \
                    and (nxt or p_len == 1) \
                    and self.is_title(piece):
                self.title_list.append(piece)
                continue
            if not self.first:
                if p_len == 1 and self.nickname:
                    self.last_list.append(piece)
                    continue
                self.first_list.append(piece)
                continue
            if self.are_suffixes(pieces[i+1:]) or \
                    (
                        # if the next piece is the last piece and a roman
                        # numeral but this piece is not an initial
                        self.is_roman_numeral(nxt) and i == p_len - 2
                        and not self.is_an_initial(piece)
                    ):
                self.last_list.append(piece)
                self.suffix_list += pieces[i+1:]
                break
            if not nxt:
                self.last_list.append(piece)
                continue

            self.middle_list.append(piece)
    else:
        # if all the end parts are suffixes and there is more than one piece
        # in the first part. (Suffixes will never appear after last names
        # only, and allows potential first names to be in suffixes, e.g.
        # "Johnson, Bart"

        post_comma_pieces = self.parse_pieces(parts[1].split(' '), 1)

        if self.are_suffixes(parts[1].split(' ')) \
                and len(parts[0].split(' ')) > 1:

            # suffix comma:
            # title first middle last [suffix], suffix [suffix] [, suffix]
            #               parts[0],          parts[1:...]

            self.suffix_list += parts[1:]
            pieces = self.parse_pieces(parts[0].split(' '))
            log.debug("pieces: %s", u(pieces))
            for i, piece in enumerate(pieces):
                try:
                    nxt = pieces[i + 1]
                except IndexError:
                    nxt = None

                if not self.first \
                        and (nxt or len(pieces) == 1) \
                        and self.is_title(piece):
                    self.title_list.append(piece)
                    continue
                if not self.first:
                    self.first_list.append(piece)
                    continue
                if self.are_suffixes(pieces[i+1:]):
                    self.last_list.append(piece)
                    # suffixes found before the comma go ahead of the
                    # ones that followed it
                    self.suffix_list = pieces[i+1:] + self.suffix_list
                    break
                if not nxt:
                    self.last_list.append(piece)
                    continue
                self.middle_list.append(piece)
        else:

            # lastname comma:
            # last [suffix], title first middles[,] suffix [,suffix]
            #      parts[0],      parts[1],              parts[2:...]

            log.debug("post-comma pieces: %s", u(post_comma_pieces))

            # lastname part may have suffixes in it
            lastname_pieces = self.parse_pieces(parts[0].split(' '), 1)
            for piece in lastname_pieces:
                # the first one is always a last name, even if it looks like
                # a suffix
                if self.is_suffix(piece) and len(self.last_list) > 0:
                    self.suffix_list.append(piece)
                else:
                    self.last_list.append(piece)

            for i, piece in enumerate(post_comma_pieces):
                try:
                    nxt = post_comma_pieces[i + 1]
                except IndexError:
                    nxt = None

                if not self.first \
                        and (nxt or len(post_comma_pieces) == 1) \
                        and self.is_title(piece):
                    self.title_list.append(piece)
                    continue
                if not self.first:
                    self.first_list.append(piece)
                    continue
                if self.is_suffix(piece):
                    self.suffix_list.append(piece)
                    continue
                self.middle_list.append(piece)
            # anything after a second comma is treated as suffixes
            try:
                if parts[2]:
                    self.suffix_list += parts[2:]
            except IndexError:
                pass

    # len(self) counts the non-empty attributes and can never be
    # negative, so the check has to be "fewer than one".
    if len(self) < 1:
        log.info("Unparsable: \"%s\" ", self.original)
    else:
        self.unparsable = False
    self.post_process()

730 

def parse_pieces(self, parts, additional_parts_count=0):
    """
    Split parts on spaces and remove commas, join on conjunctions and
    lastname prefixes. If parts have periods in the middle, try splitting
    on periods and check if the parts are titles or suffixes. If they are
    add to the constant so they will be found.

    :param list parts: name part strings from the comma split
    :param int additional_parts_count:

        if the comma format contains other parts, we need to know
        how many there are to decide if things should be considered a
        conjunction.
    :return: pieces split on spaces and joined on conjunctions
    :rtype: list
    :raises TypeError: if any part is not a string
    """
    tokens = []
    for part in parts:
        if not isinstance(part, text_types):
            raise TypeError("Name parts must be strings. "
                            "Got {0}".format(type(part)))
        tokens.extend(chunk.strip(' ,') for chunk in part.split(' '))

    # A token such as "Lt.Gov." may be several titles or suffixes written
    # together without spaces; if so, register the whole token in the
    # constants so it gets parsed correctly later.
    for token in tokens:
        period_re = self.C.regexes.period_not_at_end
        # only tokens with a period that is not at the beginning or end
        if not (period_re and period_re.match(token)):
            continue
        chunks = token.split(".")
        title_chunks = [chunk for chunk in chunks if self.is_title(chunk)]
        suffix_chunks = [chunk for chunk in chunks if self.is_suffix(chunk)]

        # titles take precedence over suffixes
        if title_chunks:
            self.C.titles.add(token)
        elif suffix_chunks:
            self.C.suffix_not_acronyms.add(token)

    return self.join_on_conjunctions(tokens, additional_parts_count)

776 

def join_on_conjunctions(self, pieces, additional_parts_count=0):
    """
    Join conjunctions to surrounding pieces. Title- and prefix-aware. e.g.:

        ['Mr.', 'and', 'Mrs.', 'John', 'Doe'] ==>
                        ['Mr. and Mrs.', 'John', 'Doe']

        ['The', 'Secretary', 'of', 'State', 'Hillary', 'Clinton'] ==>
                        ['The Secretary of State', 'Hillary', 'Clinton']

    When joining titles, saves newly formed piece to the instance's titles
    constant so they will be parsed correctly later. E.g. after parsing the
    example names above, 'The Secretary of State' and 'Mr. and Mrs.' would
    be present in the titles constant set.

    Note that the ``pieces`` list passed in is modified in place while
    conjunctions are being joined.

    :param list pieces: name pieces strings after split on spaces
    :param int additional_parts_count: number of other comma-separated
        parts, counted towards the length checks below
    :return: new list with piece next to conjunctions merged into one piece
        with spaces in it.
    :rtype: list

    """
    length = len(pieces) + additional_parts_count
    # don't join on conjunctions if there's only 2 parts
    if length < 3:
        return pieces

    # count of pieces that are plain names, used below to decide whether a
    # single-letter conjunction is really an initial
    rootname_pieces = [p for p in pieces if self.is_rootname(p)]
    total_length = len(rootname_pieces) + additional_parts_count

    # find all the conjunctions, join any conjunctions that are next to each
    # other, then join those newly joined conjunctions and any single
    # conjunctions to the piece before and after it
    conj_index = [i for i, piece in enumerate(pieces)
                  if self.is_conjunction(piece)]

    # NOTE(review): the result of this loop is never used; the name is
    # rebound by the group_contiguous_integers() call right after it.
    contiguous_conj_i = []
    for i, val in enumerate(conj_index):
        try:
            if conj_index[i+1] == val+1:
                contiguous_conj_i += [val]
        except IndexError:
            pass

    contiguous_conj_i = group_contiguous_integers(conj_index)

    delete_i = []
    for i in contiguous_conj_i:
        if type(i) == tuple:
            # (first, last) index range of adjacent conjunctions: merge
            # them into the first slot and mark the rest for deletion
            new_piece = " ".join(pieces[i[0]: i[1]+1])
            delete_i += list(range(i[0]+1, i[1]+1))
            pieces[i[0]] = new_piece
        else:
            new_piece = " ".join(pieces[i: i+2])
            delete_i += [i+1]
            pieces[i] = new_piece
        # add newly joined conjunctions to constants to be found later
        self.C.conjunctions.add(new_piece)

    for i in reversed(delete_i):
        # delete pieces in reverse order or the index changes on each delete
        del pieces[i]

    if len(pieces) == 1:
        # if there's only one piece left, nothing left to do
        return pieces

    # refresh conjunction index locations
    conj_index = [i for i, piece in enumerate(pieces) if self.is_conjunction(piece)]

    for i in conj_index:
        if len(pieces[i]) == 1 and total_length < 4:
            # if there are only 3 total parts (minus known titles, suffixes
            # and prefixes) and this conjunction is a single letter, prefer
            # treating it as an initial rather than a conjunction.
            # http://code.google.com/p/python-nameparser/issues/detail?id=11
            continue

        if i == 0:
            # leading conjunction: there is no previous piece, so join it
            # to the following piece only
            new_piece = " ".join(pieces[i:i+2])
            if self.is_title(pieces[i+1]):
                # when joining to a title, make new_piece a title too
                self.C.titles.add(new_piece)
            pieces[i] = new_piece
            pieces.pop(i+1)
            # subtract 1 from the index of all the remaining conjunctions
            for j, val in enumerate(conj_index):
                if val > i:
                    conj_index[j] = val-1

        else:
            # join previous piece, conjunction and next piece into one
            new_piece = " ".join(pieces[i-1:i+2])
            if self.is_title(pieces[i-1]):
                # when joining to a title, make new_piece a title too
                self.C.titles.add(new_piece)
            pieces[i-1] = new_piece
            pieces.pop(i)
            rm_count = 2
            try:
                pieces.pop(i)
            except IndexError:
                # the conjunction was the last piece, only one was removed
                rm_count = 1

            # subtract the number of removed pieces from the index
            # of all the remaining conjunctions
            for j, val in enumerate(conj_index):
                if val > i:
                    conj_index[j] = val - rm_count

    # join prefixes to following lastnames: ['de la Vega'], ['van Buren']
    prefixes = list(filter(self.is_prefix, pieces))
    if prefixes:
        for prefix in prefixes:
            try:
                i = pieces.index(prefix)
            except ValueError:
                # If the prefix is no longer in pieces, it's because it has been
                # combined with the prefix that appears right before (or before that when
                # chained together) in the last loop, so the index of that newly created
                # piece is the same as in the last loop, i==i still, and we want to join
                # it to the next piece.
                pass

            new_piece = ''

            # join everything after the prefix until the next prefix or suffix

            try:
                if i == 0 and total_length >= 1:
                    # If it's the first piece and there are more than 1 rootnames, assume it's a first name
                    continue
                # next() raises StopIteration when no later prefix exists
                next_prefix = next(iter(filter(self.is_prefix, pieces[i + 1:])))
                j = pieces.index(next_prefix)
                if j == i + 1:
                    # if there are two prefixes in sequence, join to the following piece
                    j += 1
                new_piece = ' '.join(pieces[i:j])
                pieces = pieces[:i] + [new_piece] + pieces[j:]
            except StopIteration:
                try:
                    # if there are no more prefixes, look for a suffix to stop at
                    stop_at = next(iter(filter(self.is_suffix, pieces[i + 1:])))
                    j = pieces.index(stop_at)
                    new_piece = ' '.join(pieces[i:j])
                    pieces = pieces[:i] + [new_piece] + pieces[j:]
                except StopIteration:
                    # if there were no suffixes, nothing to stop at so join all
                    # remaining pieces
                    new_piece = ' '.join(pieces[i:])
                    pieces = pieces[:i] + [new_piece]

    log.debug("pieces: %s", pieces)
    return pieces

930 

931 # Capitalization Support 

932 

def cap_word(self, word, attribute):
    """
    Return ``word`` with its capitalization corrected.

    Conjunctions, and prefixes inside a last or middle name, are
    lower-cased; entries of ``C.capitalization_exceptions`` are replaced
    by their stored form; words matching the ``mac`` regex get both
    captured groups capitalized; everything else is simply capitalized.

    :param str word: a single word
    :param str attribute: name of the attribute the word belongs to
    """
    surname_prefix = self.is_prefix(word) and attribute in ('last', 'middle')
    if surname_prefix or self.is_conjunction(word):
        return word.lower()

    exceptions = self.C.capitalization_exceptions
    lowered = lc(word)
    if lowered in exceptions:
        return exceptions[lowered]

    mac_re = self.C.regexes.mac
    if not mac_re.match(word):
        return word.capitalize()

    def cap_both_groups(found):
        return found.group(1).capitalize() + found.group(2).capitalize()
    return mac_re.sub(cap_both_groups, word)

947 

def cap_piece(self, piece, attribute):
    """
    Apply :py:func:`cap_word` to every match of the ``word`` regex in
    ``piece``. A falsy piece yields an empty string.
    """
    if not piece:
        return ""
    word_re = self.C.regexes.word
    return word_re.sub(lambda found: self.cap_word(found.group(0), attribute), piece)

954 

def capitalize(self, force=None):
    """
    The HumanName class can try to guess the correct capitalization of name
    entered in all upper or lower case. By default, it will not adjust the
    case of names entered in mixed case. To run capitalization on all names
    pass the parameter `force=True`.

    The title, first, middle, last and suffix lists are rebuilt from the
    re-capitalized strings; an empty attribute results in an empty list.

    :param bool force: Forces capitalization of mixed case strings. This
        parameter overrides rules set within
        :py:class:`~nameparser.config.CONSTANTS`.

    **Usage**

    .. doctest:: capitalize

        >>> name = HumanName('bob v. de la macdole-eisenhower phd')
        >>> name.capitalize()
        >>> str(name)
        'Bob V. de la MacDole-Eisenhower Ph.D.'
        >>> # Don't touch good names
        >>> name = HumanName('Shirley Maclaine')
        >>> name.capitalize()
        >>> str(name)
        'Shirley Maclaine'
        >>> name.capitalize(force=True)
        >>> str(name)
        'Shirley MacLaine'

    """
    name = u(self)
    if force is None:
        force = self.C.force_mixed_case_capitalization

    # Mixed-case input is assumed to be intentional unless forced.
    if not force and not (name == name.upper() or name == name.lower()):
        return

    def recased(value, attribute, separator=' '):
        # "".split(' ') is [''], which would leave a bogus empty entry in
        # the list (and a stray space in joined values like surnames).
        chunks = self.cap_piece(value, attribute).split(separator)
        return [chunk for chunk in chunks if chunk]

    self.title_list = recased(self.title, 'title')
    self.first_list = recased(self.first, 'first')
    self.middle_list = recased(self.middle, 'middle')
    self.last_list = recased(self.last, 'last')
    self.suffix_list = recased(self.suffix, 'suffix', ', ')

995 

996 def handle_capitalization(self): 

997 """ 

998 Handles capitalization configurations set within 

999 :py:class:`~nameparser.config.CONSTANTS`. 

1000 """ 

1001 if self.C.capitalize_name: 

1002 self.capitalize()