Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/nameparser/parser.py: 79%

1# -*- coding: utf-8 -*-

2from __future__ import unicode_literals

4import sys

5import re

6from operator import itemgetter

7from itertools import groupby

9from nameparser.util import u

10from nameparser.util import text_types, binary_type

11from nameparser.util import lc

12from nameparser.util import log

13from nameparser.config import CONSTANTS

14from nameparser.config import Constants

15from nameparser.config import DEFAULT_ENCODING

17ENCODING = 'utf-8'

def group_contiguous_integers(data):
    """
    Find the runs of consecutive integers in ``data``.

    Returns a list of ``(first, last)`` tuples, one for every run of two or
    more consecutive values. Values that have no consecutive neighbour are
    left out of the result.
    """
    runs = []
    # Consecutive integers all share the same (position - value) offset, so
    # grouping on that offset splits the series into its contiguous runs.
    for _offset, members in groupby(enumerate(data), lambda pair: pair[0] - pair[1]):
        values = [value for _position, value in members]
        if len(values) > 1:
            runs.append((values[0], values[-1]))
    return runs

class HumanName(object):
    """
    Parse a person's name into individual components.

    Instantiation assigns to ``full_name``, and assignment to
    :py:attr:`full_name` triggers :py:func:`parse_full_name`. After parsing the
    name, these instance attributes are available. Alternatively, you can pass
    any of the instance attributes to the constructor method and skip the parsing
    process. If any of the instance attributes are passed to the constructor
    as keywords, :py:func:`parse_full_name` will not be performed.

    **HumanName Instance Attributes**

    * :py:attr:`title`
    * :py:attr:`first`
    * :py:attr:`middle`
    * :py:attr:`last`
    * :py:attr:`suffix`
    * :py:attr:`nickname`
    * :py:attr:`surnames`

    :param str full_name: The name string to be parsed.
    :param constants constants:
        a :py:class:`~nameparser.config.Constants` instance. Pass ``None`` for
        `per-instance config <customize.html>`_.
    :param str encoding: string representing the encoding of your input
    :param str string_format: python string formatting
    :param str initials_format: python initials string formatting
    :param str initials_delimiter: string delimiter for initials
    :param str first: first name
    :param str middle: middle name
    :param str last: last name
    :param str title: The title or prenominal
    :param str suffix: The suffix or postnominal
    :param str nickname: Nicknames
    """

    C = CONSTANTS
    """
    A reference to the configuration for this instance, which may or may not be
    a reference to the shared, module-wide instance at
    :py:mod:`~nameparser.config.CONSTANTS`. See `Customizing the Parser
    <customize.html>`_.
    """

    original = ''
    """
    The original string, untouched by the parser.
    """

    # Cursor used by the iterator protocol (``__iter__`` / ``__next__``).
    _count = 0
    # Attribute names exposed through iteration, slicing and ``as_dict``.
    _members = ['title', 'first', 'middle', 'last', 'suffix', 'nickname']
    unparsable = True
    _full_name = ''

    def __init__(self, full_name="", constants=CONSTANTS, encoding=DEFAULT_ENCODING,
                 string_format=None, initials_format=None, initials_delimiter=None,
                 first=None, middle=None, last=None, title=None, suffix=None,
                 nickname=None):
        # Anything that is not a Constants instance (e.g. ``None``) requests a
        # private configuration. ``isinstance`` keeps Constants subclasses
        # instead of silently replacing them.
        if isinstance(constants, type(CONSTANTS)):
            self.C = constants
        else:
            self.C = Constants()

        self.encoding = encoding
        self.string_format = string_format or self.C.string_format
        self.initials_format = initials_format or self.C.initials_format
        self.initials_delimiter = initials_delimiter or self.C.initials_delimiter
        if (first or middle or last or title or suffix or nickname):
            # Explicit components were supplied: store them and skip parsing.
            self.first = first
            self.middle = middle
            self.last = last
            self.title = title
            self.suffix = suffix
            self.nickname = nickname
            self.unparsable = False
        else:
            # full_name setter triggers the parse
            self.full_name = full_name

    def __iter__(self):
        # The instance is its own iterator; restart the cursor so that an
        # earlier, abandoned loop cannot leave a stale position behind.
        self._count = 0
        return self

    def __len__(self):
        """Return the number of non-empty name attributes."""
        # Counted directly rather than by iterating ``self`` so that calling
        # len() does not disturb the shared iteration cursor.
        return sum(1 for member in self._members if getattr(self, member))

    def __eq__(self, other):
        """
        HumanName instances are equal to other objects whose
        lower case unicode representation is the same.
        """
        return (u(self)).lower() == (u(other)).lower()

    def __ne__(self, other):
        return not (u(self)).lower() == (u(other)).lower()

    def __getitem__(self, key):
        """
        Return the attribute named ``key``, or for a slice the list of
        attribute values for that slice of the member names.
        """
        if isinstance(key, slice):
            return [getattr(self, x) for x in self._members[key]]
        else:
            return getattr(self, key)

    def __setitem__(self, key, value):
        """
        Assign ``value`` to the name attribute ``key``.

        :raises KeyError: if ``key`` is not one of the name attributes.
        """
        if key in self._members:
            self._set_list(key, value)
        else:
            raise KeyError("Not a valid HumanName attribute", key)

    def next(self):
        # Python 2 spelling of the iterator protocol.
        return self.__next__()

    def __next__(self):
        """Return the next non-empty name attribute, skipping empty ones."""
        while self._count < len(self._members):
            value = getattr(self, self._members[self._count])
            self._count += 1
            if value:
                return value
        # Exhausted: rewind so the instance can be iterated again.
        self._count = 0
        raise StopIteration

    def __unicode__(self):
        if self.string_format:
            # string_format = "{title} {first} {middle} {last} {suffix} ({nickname})"
            _s = self.string_format.format(**self.as_dict())
            # remove trailing punctuation from missing nicknames
            _s = _s.replace(str(self.C.empty_attribute_default), '').replace(" ()", "").replace(" ''", "").replace(' ""', "")
            return self.collapse_whitespace(_s).strip(', ')
        return " ".join(self)

    def __hash__(self):
        # NOTE: __eq__ compares case-insensitively while this hash is
        # case-sensitive, so names differing only by case compare equal but
        # hash differently.
        return hash(str(self))

    def __str__(self):
        if sys.version_info[0] >= 3:
            return self.__unicode__()
        return self.__unicode__().encode(self.encoding)

    def __repr__(self):
        if self.unparsable:
            _string = "<%(class)s : [ Unparsable ] >" % {'class': self.__class__.__name__, }
        else:
            _string = "<%(class)s : [\n\ttitle: %(title)r \n\tfirst: %(first)r \n\tmiddle: %(middle)r \n\tlast: %(last)r \n\tsuffix: %(suffix)r\n\tnickname: %(nickname)r\n]>" % {
                'class': self.__class__.__name__,
                'title': self.title or '',
                'first': self.first or '',
                'middle': self.middle or '',
                'last': self.last or '',
                'suffix': self.suffix or '',
                'nickname': self.nickname or '',
            }
        if sys.version_info[0] >= 3:
            return _string
        return _string.encode(self.encoding)

    def as_dict(self, include_empty=True):
        """
        Return the parsed name as a dictionary of its attributes.

        :param bool include_empty: Include keys in the dictionary for empty name attributes.
        :rtype: dict

        .. doctest::

            >>> name = HumanName("Bob Dole")
            >>> name.as_dict()
            {'last': 'Dole', 'suffix': '', 'title': '', 'middle': '', 'nickname': '', 'first': 'Bob'}
            >>> name.as_dict(False)
            {'last': 'Dole', 'first': 'Bob'}

        """
        d = {}
        for m in self._members:
            val = getattr(self, m)
            if include_empty or val:
                d[m] = val
        return d

    def __process_initial__(self, name_part, firstname=False):
        """
        Name parts may include prefixes or conjunctions. This function filters these from the name unless it is
        a first name, since first names cannot be conjunctions or prefixes.

        :param str name_part: one name piece, possibly containing spaces
        :param bool firstname: keep prefixes and conjunctions when true
        :return: the space-joined initials, or the configured
            ``empty_attribute_default`` when no initial remains
        """
        initials = [
            part[0] for part in name_part.split(" ")
            # Empty parts (from repeated spaces) have no first character.
            if part and (firstname
                         or not (self.is_prefix(part) or self.is_conjunction(part)))
        ]
        if initials:
            return " ".join(initials)
        return self.C.empty_attribute_default

    def _initials_for(self, pieces, firstname=False):
        """Return the initials of every non-empty piece in ``pieces``."""
        return [self.__process_initial__(name, firstname) for name in pieces if name]

    def initials_list(self):
        """
        Returns the initials as a list

        .. doctest::

            >>> name = HumanName("Sir Bob Andrew Dole")
            >>> name.initials_list()
            ["B", "A", "D"]
            >>> name = HumanName("J. Doe")
            >>> name.initials_list()
            ["J", "D"]
        """
        return (self._initials_for(self.first_list, True)
                + self._initials_for(self.middle_list)
                + self._initials_for(self.last_list))

    def initials(self):
        """
        Return the initials of the first, middle and last names, rendered
        with :py:attr:`initials_format` and separated by
        :py:attr:`initials_delimiter`. Leave a name out of the result by
        omitting it from ``initials_format``.

        :rtype: str

        .. doctest::

            >>> name = HumanName("Sir Bob Andrew Dole")
            >>> name.initials()
            "B. A. D."
            >>> name = HumanName("Sir Bob Andrew Dole", initials_format="{first} {middle}")
            >>> name.initials()
            "B. A."
        """
        delimiter = self.initials_delimiter

        def render(initials):
            # Each initial is followed by the delimiter; an empty group
            # renders as the configured empty value.
            if not initials:
                return self.C.empty_attribute_default
            return (delimiter + " ").join(initials) + delimiter

        initials_dict = {
            "first": render(self._initials_for(self.first_list, True)),
            "middle": render(self._initials_for(self.middle_list)),
            "last": render(self._initials_for(self.last_list)),
        }

        _s = self.initials_format.format(**initials_dict)
        return self.collapse_whitespace(_s)

    @property
    def has_own_config(self):
        """
        True if this instance is not using the shared module-level
        configuration.
        """
        return self.C is not CONSTANTS

    # attributes

    @property
    def title(self):
        """
        The person's titles. Any string of consecutive pieces in
        :py:mod:`~nameparser.config.titles` or
        :py:mod:`~nameparser.config.conjunctions`
        at the beginning of :py:attr:`full_name`.
        """
        return " ".join(self.title_list) or self.C.empty_attribute_default

    @property
    def first(self):
        """
        The person's first name. The first name piece after any known
        :py:attr:`title` pieces parsed from :py:attr:`full_name`.
        """
        return " ".join(self.first_list) or self.C.empty_attribute_default

    @property
    def middle(self):
        """
        The person's middle names. All name pieces after the first name and
        before the last name parsed from :py:attr:`full_name`.
        """
        return " ".join(self.middle_list) or self.C.empty_attribute_default

    @property
    def last(self):
        """
        The person's last name. The last name piece parsed from
        :py:attr:`full_name`.
        """
        return " ".join(self.last_list) or self.C.empty_attribute_default

    @property
    def suffix(self):
        """
        The persons's suffixes. Pieces at the end of the name that are found in
        :py:mod:`~nameparser.config.suffixes`, or pieces that are at the end
        of comma separated formats, e.g.
        "Lastname, Title Firstname Middle[,] Suffix [, Suffix]" parsed
        from :py:attr:`full_name`.
        """
        return ", ".join(self.suffix_list) or self.C.empty_attribute_default

    @property
    def nickname(self):
        """
        The person's nicknames. Any text found inside of quotes (``""``) or
        parenthesis (``()``)
        """
        return " ".join(self.nickname_list) or self.C.empty_attribute_default

    @property
    def surnames_list(self):
        """
        List of middle names followed by last name.
        """
        return self.middle_list + self.last_list

    @property
    def surnames(self):
        """
        A string of all middle names followed by the last name.
        """
        return " ".join(self.surnames_list) or self.C.empty_attribute_default

    # setter methods

    def _set_list(self, attr, value):
        """
        Store ``value`` in the ``<attr>_list`` attribute after running it
        through :py:func:`parse_pieces`.

        :raises TypeError: if ``value`` is not a list, a string or ``None``.
        """
        if isinstance(value, list):
            val = value
        elif isinstance(value, text_types):
            val = [value]
        elif value is None:
            val = []
        else:
            raise TypeError(
                "Can only assign strings, lists or None to name attributes."
                " Got {0}".format(type(value)))
        setattr(self, attr+"_list", self.parse_pieces(val))

    @title.setter
    def title(self, value):
        self._set_list('title', value)

    @first.setter
    def first(self, value):
        self._set_list('first', value)

    @middle.setter
    def middle(self, value):
        self._set_list('middle', value)

    @last.setter
    def last(self, value):
        self._set_list('last', value)

    @suffix.setter
    def suffix(self, value):
        self._set_list('suffix', value)

    @nickname.setter
    def nickname(self, value):
        self._set_list('nickname', value)

    # Parse helpers

    def is_title(self, value):
        """Is in the :py:data:`~nameparser.config.titles.TITLES` set."""
        return lc(value) in self.C.titles

    def is_conjunction(self, piece):
        """
        Is in the conjunctions set and not :py:func:`is_an_initial()`.
        For a list, true when any item is a conjunction.
        """
        if isinstance(piece, list):
            return any(self.is_conjunction(item) for item in piece)
        return piece.lower() in self.C.conjunctions and not self.is_an_initial(piece)

    def is_prefix(self, piece):
        """
        Lowercase and no periods version of piece is in the
        :py:data:`~nameparser.config.prefixes.PREFIXES` set.
        For a list, true when any item is a prefix.
        """
        if isinstance(piece, list):
            return any(self.is_prefix(item) for item in piece)
        return lc(piece) in self.C.prefixes

    def is_roman_numeral(self, value):
        """
        Matches the ``roman_numeral`` regular expression in
        :py:data:`~nameparser.config.regexes.REGEXES`.
        """
        return bool(self.C.regexes.roman_numeral.match(value))

    def is_suffix(self, piece):
        """
        Is in the suffixes set and not :py:func:`is_an_initial()`.
        For a list, true when any item is a suffix.

        Some suffixes may be acronyms (M.B.A) while some are not (Jr.),
        so we remove the periods from `piece` when testing against
        `C.suffix_acronyms`.
        """
        # suffixes may have periods inside them like "M.D."
        if isinstance(piece, list):
            return any(self.is_suffix(item) for item in piece)
        return ((lc(piece).replace('.', '') in self.C.suffix_acronyms)
                or (lc(piece) in self.C.suffix_not_acronyms)) \
            and not self.is_an_initial(piece)

    def are_suffixes(self, pieces):
        """Return True if all pieces are suffixes."""
        for piece in pieces:
            if not self.is_suffix(piece):
                return False
        return True

    def is_rootname(self, piece):
        """
        Is not a known title, suffix or prefix. Just first, middle, last names.
        """
        return lc(piece) not in self.C.suffixes_prefixes_titles \
            and not self.is_an_initial(piece)

    def is_an_initial(self, value):
        """
        Words with a single period at the end, or a single uppercase letter.

        Matches the ``initial`` regular expression in
        :py:data:`~nameparser.config.regexes.REGEXES`.
        """
        return bool(self.C.regexes.initial.match(value))

    # full_name parser

    @property
    def full_name(self):
        """The string output of the HumanName instance."""
        return self.__str__()

    @full_name.setter
    def full_name(self, value):
        self.original = value
        self._full_name = value
        if isinstance(value, binary_type):
            self._full_name = value.decode(self.encoding)
        self.parse_full_name()

    def collapse_whitespace(self, string):
        """
        Strip ``string``, collapse whitespace runs into single spaces and
        drop one trailing comma.
        """
        # collapse multiple spaces into single space
        string = self.C.regexes.spaces.sub(" ", string.strip())
        if string.endswith(","):
            string = string[:-1]
        return string

    def pre_process(self):
        """

        This method happens at the beginning of the :py:func:`parse_full_name`
        before any other processing of the string aside from unicode
        normalization, so it's a good place to do any custom handling in a
        subclass. Runs :py:func:`fix_phd`, :py:func:`parse_nicknames` and
        :py:func:`squash_emoji`.

        """
        self.fix_phd()
        self.parse_nicknames()
        self.squash_emoji()

    def post_process(self):
        """
        This happens at the end of the :py:func:`parse_full_name` after
        all other processing has taken place. Runs :py:func:`handle_firstnames`
        and :py:func:`handle_capitalization`.
        """
        self.handle_firstnames()
        self.handle_capitalization()

    def fix_phd(self):
        """
        Move text matched by the ``phd`` regular expression out of the name
        string and into the suffix list.
        """
        try:
            _re = self.C.regexes.phd
            match = _re.search(self._full_name)
            if match:
                self.suffix_list.append(match.group(1))
                self._full_name = _re.sub('', self._full_name)
        except AttributeError:
            # Best effort: an AttributeError (for example when no usable
            # ``phd`` regex is configured) leaves the name untouched.
            pass

    def parse_nicknames(self):
        """
        The content of parenthesis or quotes in the name will be added to the
        nicknames list. This happens before any other processing of the name.

        Single quotes cannot span white space characters and must border
        white space to allow for quotes in names like O'Connor and Kawai'ae'a.
        Double quotes and parenthesis can span white space.

        Loops through 3 :py:data:`~nameparser.config.regexes.REGEXES`;
        `quoted_word`, `double_quotes` and `parenthesis`. A regex that is
        unset (falsy) is skipped.
        """
        nickname_regexes = (
            self.C.regexes.quoted_word,
            self.C.regexes.double_quotes,
            self.C.regexes.parenthesis,
        )
        for _re in nickname_regexes:
            # An unset regex must be skipped: substituting an empty pattern
            # would match everywhere and add empty nicknames.
            if not _re:
                continue
            if _re.search(self._full_name):
                self.nickname_list += _re.findall(self._full_name)
                self._full_name = _re.sub('', self._full_name)

    def squash_emoji(self):
        """
        Remove emoji from the input string.
        """
        re_emoji = self.C.regexes.emoji
        if re_emoji and re_emoji.search(self._full_name):
            self._full_name = re_emoji.sub('', self._full_name)

    def handle_firstnames(self):
        """
        If there are only two parts and one is a title, assume it's a last name
        instead of a first name. e.g. Mr. Johnson. Unless it's a special title
        like "Sir", then when it's followed by a single name that name is always
        a first name.
        """
        if self.title \
                and len(self) == 2 \
                and lc(self.title) not in self.C.first_name_titles:
            self.last, self.first = self.first, self.last

    def parse_full_name(self):
        """

        The main parse method for the parser. This method is run upon
        assignment to the :py:attr:`full_name` attribute or instantiation.

        Basic flow is to hand off to :py:func:`pre_process` to handle
        nicknames. It then splits on commas and chooses a code path depending
        on the number of commas.

        :py:func:`parse_pieces` then splits those parts on spaces and
        :py:func:`join_on_conjunctions` joins any pieces next to conjunctions.
        """

        self.title_list = []
        self.first_list = []
        self.middle_list = []
        self.last_list = []
        self.suffix_list = []
        self.nickname_list = []
        self.unparsable = True

        self.pre_process()

        self._full_name = self.collapse_whitespace(self._full_name)

        # break up full_name by commas
        parts = [x.strip() for x in self._full_name.split(",")]

        log.debug("full_name: %s", self._full_name)
        log.debug("parts: %s", parts)

        if len(parts) == 1:

            # no commas, title first middle middle middle last suffix
            #            part[0]

            pieces = self.parse_pieces(parts)
            p_len = len(pieces)
            for i, piece in enumerate(pieces):
                nxt = pieces[i + 1] if i + 1 < p_len else None

                # title must have a next piece, unless it's just a title
                if not self.first \
                        and (nxt or p_len == 1) \
                        and self.is_title(piece):
                    self.title_list.append(piece)
                    continue
                if not self.first:
                    if p_len == 1 and self.nickname:
                        self.last_list.append(piece)
                        continue
                    self.first_list.append(piece)
                    continue
                if self.are_suffixes(pieces[i+1:]) or \
                        (
                            # if the next piece is the last piece and a roman
                            # numeral but this piece is not an initial
                            self.is_roman_numeral(nxt) and i == p_len - 2
                            and not self.is_an_initial(piece)
                        ):
                    self.last_list.append(piece)
                    self.suffix_list += pieces[i+1:]
                    break
                if not nxt:
                    self.last_list.append(piece)
                    continue

                self.middle_list.append(piece)
        else:
            # if all the end parts are suffixes and there is more than one piece
            # in the first part. (Suffixes will never appear after last names
            # only, and allows potential first names to be in suffixes, e.g.
            # "Johnson, Bart"

            post_comma_pieces = self.parse_pieces(parts[1].split(' '), 1)

            if self.are_suffixes(parts[1].split(' ')) \
                    and len(parts[0].split(' ')) > 1:

                # suffix comma:
                # title first middle last [suffix], suffix [suffix] [, suffix]
                #               parts[0],          parts[1:...]

                self.suffix_list += parts[1:]
                pieces = self.parse_pieces(parts[0].split(' '))
                log.debug("pieces: %s", u(pieces))
                for i, piece in enumerate(pieces):
                    nxt = pieces[i + 1] if i + 1 < len(pieces) else None

                    if not self.first \
                            and (nxt or len(pieces) == 1) \
                            and self.is_title(piece):
                        self.title_list.append(piece)
                        continue
                    if not self.first:
                        self.first_list.append(piece)
                        continue
                    if self.are_suffixes(pieces[i+1:]):
                        self.last_list.append(piece)
                        self.suffix_list = pieces[i+1:] + self.suffix_list
                        break
                    if not nxt:
                        self.last_list.append(piece)
                        continue
                    self.middle_list.append(piece)
            else:

                # lastname comma:
                # last [suffix], title first middles[,] suffix [,suffix]
                #      parts[0],      parts[1],              parts[2:...]

                log.debug("post-comma pieces: %s", u(post_comma_pieces))

                # lastname part may have suffixes in it
                lastname_pieces = self.parse_pieces(parts[0].split(' '), 1)
                for piece in lastname_pieces:
                    # the first one is always a last name, even if it looks like
                    # a suffix
                    if self.is_suffix(piece) and len(self.last_list) > 0:
                        self.suffix_list.append(piece)
                    else:
                        self.last_list.append(piece)

                for i, piece in enumerate(post_comma_pieces):
                    nxt = (post_comma_pieces[i + 1]
                           if i + 1 < len(post_comma_pieces) else None)

                    if not self.first \
                            and (nxt or len(post_comma_pieces) == 1) \
                            and self.is_title(piece):
                        self.title_list.append(piece)
                        continue
                    if not self.first:
                        self.first_list.append(piece)
                        continue
                    if self.is_suffix(piece):
                        self.suffix_list.append(piece)
                        continue
                    self.middle_list.append(piece)
                # everything after the second comma is a suffix
                if len(parts) > 2 and parts[2]:
                    self.suffix_list += parts[2:]

        # A parse that produced no attribute at all is unparsable. (The
        # length can never be negative, so comparing against 0 could not
        # detect this.)
        if len(self) < 1:
            log.info("Unparsable: \"%s\" ", self.original)
        else:
            self.unparsable = False
        self.post_process()

    def parse_pieces(self, parts, additional_parts_count=0):
        """
        Split parts on spaces and remove commas, join on conjunctions and
        lastname prefixes. If parts have periods in the middle, try splitting
        on periods and check if the parts are titles or suffixes. If they are
        add to the constant so they will be found.

        :param list parts: name part strings from the comma split
        :param int additional_parts_count:

            if the comma format contains other parts, we need to know
            how many there are to decide if things should be considered a
            conjunction.
        :return: pieces split on spaces and joined on conjunctions
        :rtype: list
        :raises TypeError: if any part is not a string
        """

        output = []
        for part in parts:
            if not isinstance(part, text_types):
                raise TypeError("Name parts must be strings. "
                                "Got {0}".format(type(part)))
            output += [x.strip(' ,') for x in part.split(' ')]

        # If part contains periods, check if it's multiple titles or suffixes
        # together without spaces if so, add the new part with periods to the
        # constants so they get parsed correctly later
        period_re = self.C.regexes.period_not_at_end
        if period_re:
            for part in output:
                # if this part has a period not at the beginning or end
                if not period_re.match(part):
                    continue
                # split on periods, any of the split pieces titles or suffixes?
                # ("Lt.Gov.")
                period_chunks = part.split(".")

                # add the part to the constant so it will be found
                if any(self.is_title(chunk) for chunk in period_chunks):
                    self.C.titles.add(part)
                elif any(self.is_suffix(chunk) for chunk in period_chunks):
                    self.C.suffix_not_acronyms.add(part)

        return self.join_on_conjunctions(output, additional_parts_count)

    def join_on_conjunctions(self, pieces, additional_parts_count=0):
        """
        Join conjunctions to surrounding pieces. Title- and prefix-aware. e.g.:

            ['Mr.', 'and'. 'Mrs.', 'John', 'Doe'] ==>
                            ['Mr. and Mrs.', 'John', 'Doe']

            ['The', 'Secretary', 'of', 'State', 'Hillary', 'Clinton'] ==>
                            ['The Secretary of State', 'Hillary', 'Clinton']

        When joining titles, saves newly formed piece to the instance's titles
        constant so they will be parsed correctly later. E.g. after parsing the
        example names above, 'The Secretary of State' and 'Mr. and Mrs.' would
        be present in the titles constant set.

        :param list pieces: name pieces strings after split on spaces
        :param int additional_parts_count:
        :return: new list with piece next to conjunctions merged into one piece
            with spaces in it.
        :rtype: list

        """
        length = len(pieces) + additional_parts_count
        # don't join on conjunctions if there's only 2 parts
        if length < 3:
            return pieces

        rootname_pieces = [p for p in pieces if self.is_rootname(p)]
        total_length = len(rootname_pieces) + additional_parts_count

        # find all the conjunctions, join any conjunctions that are next to each
        # other, then join those newly joined conjunctions and any single
        # conjunctions to the piece before and after it
        conj_index = [i for i, piece in enumerate(pieces)
                      if self.is_conjunction(piece)]

        delete_i = []
        for start, end in group_contiguous_integers(conj_index):
            # merge each run of adjacent conjunctions into its first slot
            new_piece = " ".join(pieces[start: end+1])
            delete_i += list(range(start+1, end+1))
            pieces[start] = new_piece
            # add newly joined conjunctions to constants to be found later
            self.C.conjunctions.add(new_piece)

        for i in reversed(delete_i):
            # delete pieces in reverse order or the index changes on each delete
            del pieces[i]

        if len(pieces) == 1:
            # if there's only one piece left, nothing left to do
            return pieces

        # refresh conjunction index locations
        conj_index = [i for i, piece in enumerate(pieces) if self.is_conjunction(piece)]

        # NOTE: conj_index is updated in place while it is being iterated, so
        # later iterations see the shifted positions.
        for i in conj_index:
            if len(pieces[i]) == 1 and total_length < 4:
                # if there are only 3 total parts (minus known titles, suffixes
                # and prefixes) and this conjunction is a single letter, prefer
                # treating it as an initial rather than a conjunction.
                # http://code.google.com/p/python-nameparser/issues/detail?id=11
                continue

            if i == 0:
                new_piece = " ".join(pieces[i:i+2])
                if self.is_title(pieces[i+1]):
                    # when joining to a title, make new_piece a title too
                    self.C.titles.add(new_piece)
                pieces[i] = new_piece
                pieces.pop(i+1)
                # subtract 1 from the index of all the remaining conjunctions
                for j, val in enumerate(conj_index):
                    if val > i:
                        conj_index[j] = val-1

            else:
                new_piece = " ".join(pieces[i-1:i+2])
                if self.is_title(pieces[i-1]):
                    # when joining to a title, make new_piece a title too
                    self.C.titles.add(new_piece)
                pieces[i-1] = new_piece
                pieces.pop(i)
                rm_count = 2
                try:
                    pieces.pop(i)
                except IndexError:
                    # the conjunction was the last piece; only one was removed
                    rm_count = 1

                # subtract the number of removed pieces from the index
                # of all the remaining conjunctions
                for j, val in enumerate(conj_index):
                    if val > i:
                        conj_index[j] = val - rm_count

        # join prefixes to following lastnames: ['de la Vega'], ['van Buren']
        prefixes = list(filter(self.is_prefix, pieces))
        if prefixes:
            for prefix in prefixes:
                try:
                    i = pieces.index(prefix)
                except ValueError:
                    # If the prefix is no longer in pieces, it's because it has been
                    # combined with the prefix that appears right before (or before that when
                    # chained together) in the last loop, so the index of that newly created
                    # piece is the same as in the last loop, i==i still, and we want to join
                    # it to the next piece.
                    pass

                new_piece = ''

                # join everything after the prefix until the next prefix or suffix

                try:
                    if i == 0 and total_length >= 1:
                        # If it's the first piece and there are more than 1 rootnames, assume it's a first name
                        continue
                    next_prefix = next(iter(filter(self.is_prefix, pieces[i + 1:])))
                    j = pieces.index(next_prefix)
                    if j == i + 1:
                        # if there are two prefixes in sequence, join to the following piece
                        j += 1
                    new_piece = ' '.join(pieces[i:j])
                    pieces = pieces[:i] + [new_piece] + pieces[j:]
                except StopIteration:
                    try:
                        # if there are no more prefixes, look for a suffix to stop at
                        stop_at = next(iter(filter(self.is_suffix, pieces[i + 1:])))
                        j = pieces.index(stop_at)
                        new_piece = ' '.join(pieces[i:j])
                        pieces = pieces[:i] + [new_piece] + pieces[j:]
                    except StopIteration:
                        # if there were no suffixes, nothing to stop at so join all
                        # remaining pieces
                        new_piece = ' '.join(pieces[i:])
                        pieces = pieces[:i] + [new_piece]

        log.debug("pieces: %s", pieces)
        return pieces

    # Capitalization Support

    def cap_word(self, word, attribute):
        """
        Return ``word`` capitalized for use in the name attribute
        ``attribute``. Conjunctions, and prefixes inside last or middle
        names, are lower-cased; configured capitalization exceptions and
        words matching the ``mac`` regular expression get special casing.
        """
        if (self.is_prefix(word) and attribute in ('last', 'middle')) \
                or self.is_conjunction(word):
            return word.lower()
        exceptions = self.C.capitalization_exceptions
        if lc(word) in exceptions:
            return exceptions[lc(word)]
        mac_match = self.C.regexes.mac.match(word)
        if mac_match:
            def cap_after_mac(m):
                return m.group(1).capitalize() + m.group(2).capitalize()
            return self.C.regexes.mac.sub(cap_after_mac, word)
        else:
            return word.capitalize()

    def cap_piece(self, piece, attribute):
        """
        Apply :py:func:`cap_word` to every match of the ``word`` regular
        expression in ``piece``. An empty piece yields an empty string.
        """
        if not piece:
            return ""

        def replacement(m): return self.cap_word(m.group(0), attribute)
        return self.C.regexes.word.sub(replacement, piece)

    def capitalize(self, force=None):
        """
        The HumanName class can try to guess the correct capitalization of name
        entered in all upper or lower case. By default, it will not adjust the
        case of names entered in mixed case. To run capitalization on all names
        pass the parameter `force=True`.

        :param bool force: Forces capitalization of mixed case strings. This
            parameter overrides rules set within
            :py:class:`~nameparser.config.CONSTANTS`.

        **Usage**

        .. doctest:: capitalize

            >>> name = HumanName('bob v. de la macdole-eisenhower phd')
            >>> name.capitalize()
            >>> str(name)
            'Bob V. de la MacDole-Eisenhower Ph.D.'
            >>> # Don't touch good names
            >>> name = HumanName('Shirley Maclaine')
            >>> name.capitalize()
            >>> str(name)
            'Shirley Maclaine'
            >>> name.capitalize(force=True)
            >>> str(name)
            'Shirley MacLaine'

        """
        name = u(self)
        force = self.C.force_mixed_case_capitalization \
            if force is None else force

        if not force and not (name == name.upper() or name == name.lower()):
            return
        self.title_list = self.cap_piece(self.title, 'title').split(' ')
        self.first_list = self.cap_piece(self.first, 'first').split(' ')
        self.middle_list = self.cap_piece(self.middle, 'middle').split(' ')
        self.last_list = self.cap_piece(self.last, 'last').split(' ')
        self.suffix_list = self.cap_piece(self.suffix, 'suffix').split(', ')

    def handle_capitalization(self):
        """
        Handles capitalization configurations set within
        :py:class:`~nameparser.config.CONSTANTS`.
        """
        if self.C.capitalize_name:
            self.capitalize()