Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/nameparser/parser.py: 79%
477 statements
« prev ^ index » next coverage.py v7.2.7, created at 2023-06-07 06:08 +0000
« prev ^ index » next coverage.py v7.2.7, created at 2023-06-07 06:08 +0000
1# -*- coding: utf-8 -*-
2from __future__ import unicode_literals
4import sys
5import re
6from operator import itemgetter
7from itertools import groupby
9from nameparser.util import u
10from nameparser.util import text_types, binary_type
11from nameparser.util import lc
12from nameparser.util import log
13from nameparser.config import CONSTANTS
14from nameparser.config import Constants
15from nameparser.config import DEFAULT_ENCODING
17ENCODING = 'utf-8'
def group_contiguous_integers(data):
    """
    Find the runs of consecutive integers in ``data``.

    :param data: iterable of integers (the parser passes ascending indices)
    :return: one ``(first, last)`` tuple per run that contains at least two
        numbers; isolated numbers are not reported
    :rtype: list
    """
    spans = []
    # Within a run of consecutive integers the difference between a value's
    # position and the value itself is constant, so it works as a group key.
    for _, run in groupby(enumerate(data), lambda pair: pair[0] - pair[1]):
        members = [value for _, value in run]
        if len(members) > 1:
            spans.append((members[0], members[-1]))
    return spans
class HumanName(object):
    """
    Parse a person's name into individual components.

    Instantiation assigns to ``full_name``, and assignment to
    :py:attr:`full_name` triggers :py:func:`parse_full_name`. After parsing the
    name, these instance attributes are available. Alternatively, you can pass
    any of the instance attributes to the constructor method and skip the parsing
    process. If any of the instance attributes are passed to the constructor
    as keywords, :py:func:`parse_full_name` will not be performed.

    **HumanName Instance Attributes**

    * :py:attr:`title`
    * :py:attr:`first`
    * :py:attr:`middle`
    * :py:attr:`last`
    * :py:attr:`suffix`
    * :py:attr:`nickname`
    * :py:attr:`surnames`

    :param str full_name: The name string to be parsed.
    :param constants constants:
        a :py:class:`~nameparser.config.Constants` instance. Pass ``None`` for
        `per-instance config <customize.html>`_.
    :param str encoding: string representing the encoding of your input
    :param str string_format: python string formatting
    :param str initials_format: python initials string formatting
    :param str initials_delimiter: string delimiter for initials
    :param str first: first name
    :param str middle: middle name
    :param str last: last name
    :param str title: The title or prenominal
    :param str suffix: The suffix or postnominal
    :param str nickname: Nicknames
    """

    C = CONSTANTS
    """
    A reference to the configuration for this instance, which may or may not be
    a reference to the shared, module-wide instance at
    :py:mod:`~nameparser.config.CONSTANTS`. See `Customizing the Parser
    <customize.html>`_.
    """

    original = ''
    """
    The original string, untouched by the parser.
    """

    # cursor used only by the legacy next()/__next__() protocol
    _count = 0
    # attribute names, in the order they are iterated and sliced
    _members = ['title', 'first', 'middle', 'last', 'suffix', 'nickname']
    unparsable = True
    _full_name = ''

    def __init__(self, full_name="", constants=CONSTANTS, encoding=DEFAULT_ENCODING,
                 string_format=None, initials_format=None, initials_delimiter=None,
                 first=None, middle=None, last=None, title=None, suffix=None,
                 nickname=None):
        # Anything that is not a Constants instance (e.g. None) gets its own
        # private configuration. isinstance() is used so that instances of
        # Constants subclasses are kept rather than silently replaced.
        if isinstance(constants, type(CONSTANTS)):
            self.C = constants
        else:
            self.C = Constants()

        self.encoding = encoding
        self.string_format = string_format or self.C.string_format
        self.initials_format = initials_format or self.C.initials_format
        self.initials_delimiter = initials_delimiter or self.C.initials_delimiter
        if (first or middle or last or title or suffix or nickname):
            # explicit components were supplied: store them and skip parsing
            self.first = first
            self.middle = middle
            self.last = last
            self.title = title
            self.suffix = suffix
            self.nickname = nickname
            self.unparsable = False
        else:
            # full_name setter triggers the parse
            self.full_name = full_name

    def __iter__(self):
        """
        Iterate over the non-empty name attributes in ``_members`` order.

        A fresh generator is returned on every call so that nested loops,
        abandoned loops and ``len()`` calls made while iterating do not
        interfere with each other through the shared ``_count`` cursor.
        """
        return (value
                for value in (getattr(self, member) for member in self._members)
                if value)

    def __len__(self):
        """Number of non-empty name attributes."""
        return sum(1 for _ in self)

    def __eq__(self, other):
        """
        HumanName instances are equal to other objects whose
        lower case unicode representation is the same.
        """
        return (u(self)).lower() == (u(other)).lower()

    def __ne__(self, other):
        return not (u(self)).lower() == (u(other)).lower()

    def __getitem__(self, key):
        """
        Return the attribute named ``key``, or for a slice the list of
        attribute values for ``_members[key]``.
        """
        if isinstance(key, slice):
            return [getattr(self, x) for x in self._members[key]]
        return getattr(self, key)

    def __setitem__(self, key, value):
        """
        Assign ``value`` to the name attribute ``key``.

        :raises KeyError: if ``key`` is not one of ``_members``
        """
        if key not in self._members:
            raise KeyError("Not a valid HumanName attribute", key)
        self._set_list(key, value)

    def next(self):
        # Python 2 spelling of __next__
        return self.__next__()

    def __next__(self):
        """
        Legacy cursor-based iteration kept for callers that use
        ``next(name)`` directly. The cursor lives on the instance, so
        successive calls share state; it resets once the members are
        exhausted.
        """
        if self._count >= len(self._members):
            self._count = 0
            raise StopIteration
        c = self._count
        self._count = c + 1
        # skip empty attributes by moving on to the following member
        return getattr(self, self._members[c]) or next(self)

    def __unicode__(self):
        if self.string_format:
            # string_format = "{title} {first} {middle} {last} {suffix} ({nickname})"
            _s = self.string_format.format(**self.as_dict())
            # remove trailing punctuation from missing nicknames
            _s = _s.replace(str(self.C.empty_attribute_default), '').replace(" ()", "").replace(" ''", "").replace(' ""', "")
            return self.collapse_whitespace(_s).strip(', ')
        return " ".join(self)

    def __hash__(self):
        return hash(str(self))

    def __str__(self):
        if sys.version_info[0] >= 3:
            return self.__unicode__()
        return self.__unicode__().encode(self.encoding)

    def __repr__(self):
        if self.unparsable:
            _string = "<%(class)s : [ Unparsable ] >" % {'class': self.__class__.__name__, }
        else:
            _string = "<%(class)s : [\n\ttitle: %(title)r \n\tfirst: %(first)r \n\tmiddle: %(middle)r \n\tlast: %(last)r \n\tsuffix: %(suffix)r\n\tnickname: %(nickname)r\n]>" % {
                'class': self.__class__.__name__,
                'title': self.title or '',
                'first': self.first or '',
                'middle': self.middle or '',
                'last': self.last or '',
                'suffix': self.suffix or '',
                'nickname': self.nickname or '',
            }
        if sys.version_info[0] >= 3:
            return _string
        return _string.encode(self.encoding)

    def as_dict(self, include_empty=True):
        """
        Return the parsed name as a dictionary of its attributes.

        :param bool include_empty: Include keys in the dictionary for empty name attributes.
        :rtype: dict

        .. doctest::

            >>> name = HumanName("Bob Dole")
            >>> name.as_dict()
            {'last': 'Dole', 'suffix': '', 'title': '', 'middle': '', 'nickname': '', 'first': 'Bob'}
            >>> name.as_dict(False)
            {'last': 'Dole', 'first': 'Bob'}

        """
        d = {}
        for m in self._members:
            val = getattr(self, m)
            if include_empty or val:
                d[m] = val
        return d

    def __process_initial__(self, name_part, firstname=False):
        """
        Name parts may include prefixes or conjunctions. This function filters these from the name unless it is
        a first name, since first names cannot be conjunctions or prefixes.

        :param str name_part: one name piece, possibly containing spaces
        :param bool firstname: keep prefixes and conjunctions when true
        :return: space-joined first letters of the retained words, or
            ``C.empty_attribute_default`` when nothing is retained
        """
        initials = []
        for part in name_part.split(" "):
            if not part:
                # consecutive spaces produce empty parts; they have no initial
                continue
            if firstname or not (self.is_prefix(part) or self.is_conjunction(part)):
                initials.append(part[0])
        if initials:
            return " ".join(initials)
        return self.C.empty_attribute_default

    def initials_list(self):
        """
        Returns the initials as a list

        .. doctest::

            >>> name = HumanName("Sir Bob Andrew Dole")
            >>> name.initials_list()
            ["B", "A", "D"]
            >>> name = HumanName("J. Doe")
            >>> name.initials_list()
            ["J", "D"]
        """
        first_initials_list = [self.__process_initial__(name, True) for name in self.first_list if name]
        middle_initials_list = [self.__process_initial__(name) for name in self.middle_list if name]
        last_initials_list = [self.__process_initial__(name) for name in self.last_list if name]
        return first_initials_list + middle_initials_list + last_initials_list

    def initials(self):
        """
        Return the initials of the first, middle and last names, rendered
        through ``initials_format`` with ``initials_delimiter`` after each
        initial. Leave a placeholder out of ``initials_format`` to omit that
        part of the name.

        :rtype: str

        .. doctest::

            >>> name = HumanName("Sir Bob Andrew Dole")
            >>> name.initials()
            "B. A. D."
            >>> name = HumanName("Sir Bob Andrew Dole", initials_format="{first} {middle}")
            >>> name.initials()
            "B. A."
        """

        first_initials_list = [self.__process_initial__(name, True) for name in self.first_list if name]
        middle_initials_list = [self.__process_initial__(name) for name in self.middle_list if name]
        last_initials_list = [self.__process_initial__(name) for name in self.last_list if name]

        delimiter = self.initials_delimiter

        def delimited(initials):
            # "B" + "A" with delimiter "." becomes "B. A."
            if not initials:
                return self.C.empty_attribute_default
            return (delimiter + " ").join(initials) + delimiter

        initials_dict = {
            "first": delimited(first_initials_list),
            "middle": delimited(middle_initials_list),
            "last": delimited(last_initials_list),
        }

        _s = self.initials_format.format(**initials_dict)
        return self.collapse_whitespace(_s)

    @property
    def has_own_config(self):
        """
        True if this instance is not using the shared module-level
        configuration.
        """
        return self.C is not CONSTANTS

    # attributes

    @property
    def title(self):
        """
        The person's titles. Any string of consecutive pieces in
        :py:mod:`~nameparser.config.titles` or
        :py:mod:`~nameparser.config.conjunctions`
        at the beginning of :py:attr:`full_name`.
        """
        return " ".join(self.title_list) or self.C.empty_attribute_default

    @property
    def first(self):
        """
        The person's first name. The first name piece after any known
        :py:attr:`title` pieces parsed from :py:attr:`full_name`.
        """
        return " ".join(self.first_list) or self.C.empty_attribute_default

    @property
    def middle(self):
        """
        The person's middle names. All name pieces after the first name and
        before the last name parsed from :py:attr:`full_name`.
        """
        return " ".join(self.middle_list) or self.C.empty_attribute_default

    @property
    def last(self):
        """
        The person's last name. The last name piece parsed from
        :py:attr:`full_name`.
        """
        return " ".join(self.last_list) or self.C.empty_attribute_default

    @property
    def suffix(self):
        """
        The persons's suffixes. Pieces at the end of the name that are found in
        :py:mod:`~nameparser.config.suffixes`, or pieces that are at the end
        of comma separated formats, e.g.
        "Lastname, Title Firstname Middle[,] Suffix [, Suffix]" parsed
        from :py:attr:`full_name`.
        """
        return ", ".join(self.suffix_list) or self.C.empty_attribute_default

    @property
    def nickname(self):
        """
        The person's nicknames. Any text found inside of quotes (``""``) or
        parenthesis (``()``)
        """
        return " ".join(self.nickname_list) or self.C.empty_attribute_default

    @property
    def surnames_list(self):
        """
        List of middle names followed by last name.
        """
        return self.middle_list + self.last_list

    @property
    def surnames(self):
        """
        A string of all middle names followed by the last name.
        """
        return " ".join(self.surnames_list) or self.C.empty_attribute_default

    # setter methods

    def _set_list(self, attr, value):
        """
        Normalize ``value`` to a list, run it through :py:func:`parse_pieces`
        and store the result on ``<attr>_list``.

        :raises TypeError: if ``value`` is not a string, a list or ``None``
        """
        if isinstance(value, list):
            val = value
        elif isinstance(value, text_types):
            val = [value]
        elif value is None:
            val = []
        else:
            raise TypeError(
                "Can only assign strings, lists or None to name attributes."
                " Got {0}".format(type(value)))
        setattr(self, attr+"_list", self.parse_pieces(val))

    @title.setter
    def title(self, value):
        self._set_list('title', value)

    @first.setter
    def first(self, value):
        self._set_list('first', value)

    @middle.setter
    def middle(self, value):
        self._set_list('middle', value)

    @last.setter
    def last(self, value):
        self._set_list('last', value)

    @suffix.setter
    def suffix(self, value):
        self._set_list('suffix', value)

    @nickname.setter
    def nickname(self, value):
        self._set_list('nickname', value)

    # Parse helpers

    def is_title(self, value):
        """Is in the :py:data:`~nameparser.config.titles.TITLES` set."""
        return lc(value) in self.C.titles

    def is_conjunction(self, piece):
        """
        Is in the conjunctions set and not :py:func:`is_an_initial()`.
        For a list, true when any item is a conjunction.
        """
        if isinstance(piece, list):
            return any(self.is_conjunction(item) for item in piece)
        return piece.lower() in self.C.conjunctions and not self.is_an_initial(piece)

    def is_prefix(self, piece):
        """
        Lowercase and no periods version of piece is in the
        :py:data:`~nameparser.config.prefixes.PREFIXES` set.
        For a list, true when any item is a prefix.
        """
        if isinstance(piece, list):
            return any(self.is_prefix(item) for item in piece)
        return lc(piece) in self.C.prefixes

    def is_roman_numeral(self, value):
        """
        Matches the ``roman_numeral`` regular expression in
        :py:data:`~nameparser.config.regexes.REGEXES`.
        """
        return bool(self.C.regexes.roman_numeral.match(value))

    def is_suffix(self, piece):
        """
        Is in the suffixes set and not :py:func:`is_an_initial()`.
        For a list, true when any item is a suffix.

        Some suffixes may be acronyms (M.B.A) while some are not (Jr.),
        so we remove the periods from `piece` when testing against
        `C.suffix_acronyms`.
        """
        if isinstance(piece, list):
            return any(self.is_suffix(item) for item in piece)
        # suffixes may have periods inside them like "M.D."
        return ((lc(piece).replace('.', '') in self.C.suffix_acronyms)
                or (lc(piece) in self.C.suffix_not_acronyms)) \
            and not self.is_an_initial(piece)

    def are_suffixes(self, pieces):
        """Return True if all pieces are suffixes (also for no pieces)."""
        for piece in pieces:
            if not self.is_suffix(piece):
                return False
        return True

    def is_rootname(self, piece):
        """
        Is not a known title, suffix or prefix. Just first, middle, last names.
        """
        return lc(piece) not in self.C.suffixes_prefixes_titles \
            and not self.is_an_initial(piece)

    def is_an_initial(self, value):
        """
        Words with a single period at the end, or a single uppercase letter.

        Matches the ``initial`` regular expression in
        :py:data:`~nameparser.config.regexes.REGEXES`.
        """
        return bool(self.C.regexes.initial.match(value))

    # full_name parser

    @property
    def full_name(self):
        """The string output of the HumanName instance."""
        return self.__str__()

    @full_name.setter
    def full_name(self, value):
        self.original = value
        self._full_name = value
        if isinstance(value, binary_type):
            self._full_name = value.decode(self.encoding)
        self.parse_full_name()

    def collapse_whitespace(self, string):
        """
        Strip ``string``, collapse runs of whitespace (the ``spaces`` regex)
        into single spaces and drop one trailing comma.
        """
        # collapse multiple spaces into single space
        string = self.C.regexes.spaces.sub(" ", string.strip())
        if string.endswith(","):
            string = string[:-1]
        return string

    def pre_process(self):
        """

        This method happens at the beginning of the :py:func:`parse_full_name`
        before any other processing of the string aside from unicode
        normalization, so it's a good place to do any custom handling in a
        subclass. Runs :py:func:`fix_phd`, :py:func:`parse_nicknames` and
        :py:func:`squash_emoji`.

        """
        self.fix_phd()
        self.parse_nicknames()
        self.squash_emoji()

    def post_process(self):
        """
        This happens at the end of the :py:func:`parse_full_name` after
        all other processing has taken place. Runs :py:func:`handle_firstnames`
        and :py:func:`handle_capitalization`.
        """
        self.handle_firstnames()
        self.handle_capitalization()

    def fix_phd(self):
        """
        If the ``phd`` regex matches the name, move its first group to the
        suffix list and remove the match from the string being parsed.
        """
        try:
            _re = self.C.regexes.phd
            match = _re.search(self._full_name)
            if match:
                self.suffix_list.append(match.group(1))
                self._full_name = _re.sub('', self._full_name)
        except AttributeError:
            # best effort: presumably guards against the ``phd`` regex not
            # being configured -- TODO confirm against nameparser.config
            pass

    def parse_nicknames(self):
        """
        The content of parenthesis or quotes in the name will be added to the
        nicknames list. This happens before any other processing of the name.

        Single quotes cannot span white space characters and must border
        white space to allow for quotes in names like O'Connor and Kawai'ae'a.
        Double quotes and parenthesis can span white space.

        Loops through 3 :py:data:`~nameparser.config.regexes.REGEXES`;
        `quoted_word`, `double_quotes` and `parenthesis`. A pattern that is
        not set is skipped.
        """
        patterns = (
            self.C.regexes.quoted_word,
            self.C.regexes.double_quotes,
            self.C.regexes.parenthesis,
        )
        for _re in patterns:
            # An unset pattern must be skipped rather than replaced by an
            # empty regex: the empty regex matches everywhere and would fill
            # nickname_list with empty strings.
            if _re and _re.search(self._full_name):
                self.nickname_list += _re.findall(self._full_name)
                self._full_name = _re.sub('', self._full_name)

    def squash_emoji(self):
        """
        Remove emoji from the input string.
        """
        re_emoji = self.C.regexes.emoji
        if re_emoji and re_emoji.search(self._full_name):
            self._full_name = re_emoji.sub('', self._full_name)

    def handle_firstnames(self):
        """
        If there are only two parts and one is a title, assume it's a last name
        instead of a first name. e.g. Mr. Johnson. Unless it's a special title
        like "Sir", then when it's followed by a single name that name is always
        a first name.
        """
        if self.title \
                and len(self) == 2 \
                and not lc(self.title) in self.C.first_name_titles:
            self.last, self.first = self.first, self.last

    def parse_full_name(self):
        """

        The main parse method for the parser. This method is run upon
        assignment to the :py:attr:`full_name` attribute or instantiation.

        Basic flow is to hand off to :py:func:`pre_process` to handle
        nicknames. It then splits on commas and chooses a code path depending
        on the number of commas.

        :py:func:`parse_pieces` then splits those parts on spaces and
        :py:func:`join_on_conjunctions` joins any pieces next to conjunctions.
        """

        self.title_list = []
        self.first_list = []
        self.middle_list = []
        self.last_list = []
        self.suffix_list = []
        self.nickname_list = []
        self.unparsable = True

        self.pre_process()

        self._full_name = self.collapse_whitespace(self._full_name)

        # break up full_name by commas
        parts = [x.strip() for x in self._full_name.split(",")]

        log.debug("full_name: %s", self._full_name)
        log.debug("parts: %s", parts)

        if len(parts) == 1:

            # no commas, title first middle middle middle last suffix
            #            part[0]

            pieces = self.parse_pieces(parts)
            p_len = len(pieces)
            for i, piece in enumerate(pieces):
                nxt = pieces[i + 1] if i + 1 < p_len else None

                # title must have a next piece, unless it's just a title
                if not self.first \
                        and (nxt or p_len == 1) \
                        and self.is_title(piece):
                    self.title_list.append(piece)
                    continue
                if not self.first:
                    if p_len == 1 and self.nickname:
                        # a lone piece next to a nickname is a last name
                        self.last_list.append(piece)
                        continue
                    self.first_list.append(piece)
                    continue
                # are_suffixes() is true for an empty remainder, so the roman
                # numeral test below never sees nxt=None
                if self.are_suffixes(pieces[i+1:]) or \
                        (
                            # if the next piece is the last piece and a roman
                            # numeral but this piece is not an initial
                            self.is_roman_numeral(nxt) and i == p_len - 2
                            and not self.is_an_initial(piece)
                        ):
                    self.last_list.append(piece)
                    self.suffix_list += pieces[i+1:]
                    break
                if not nxt:
                    self.last_list.append(piece)
                    continue

                self.middle_list.append(piece)
        else:
            # if all the end parts are suffixes and there is more than one piece
            # in the first part. (Suffixes will never appear after last names
            # only, and allows potential first names to be in suffixes, e.g.
            # "Johnson, Bart"

            post_comma_pieces = self.parse_pieces(parts[1].split(' '), 1)

            if self.are_suffixes(parts[1].split(' ')) \
                    and len(parts[0].split(' ')) > 1:

                # suffix comma:
                # title first middle last [suffix], suffix [suffix] [, suffix]
                #               parts[0],          parts[1:...]

                self.suffix_list += parts[1:]
                pieces = self.parse_pieces(parts[0].split(' '))
                log.debug("pieces: %s", u(pieces))
                p_len = len(pieces)
                for i, piece in enumerate(pieces):
                    nxt = pieces[i + 1] if i + 1 < p_len else None

                    if not self.first \
                            and (nxt or p_len == 1) \
                            and self.is_title(piece):
                        self.title_list.append(piece)
                        continue
                    if not self.first:
                        self.first_list.append(piece)
                        continue
                    if self.are_suffixes(pieces[i+1:]):
                        self.last_list.append(piece)
                        # suffixes before the comma precede those after it
                        self.suffix_list = pieces[i+1:] + self.suffix_list
                        break
                    if not nxt:
                        self.last_list.append(piece)
                        continue
                    self.middle_list.append(piece)
            else:

                # lastname comma:
                # last [suffix], title first middles[,] suffix [,suffix]
                #      parts[0],      parts[1],              parts[2:...]

                log.debug("post-comma pieces: %s", u(post_comma_pieces))

                # lastname part may have suffixes in it
                lastname_pieces = self.parse_pieces(parts[0].split(' '), 1)
                for piece in lastname_pieces:
                    # the first one is always a last name, even if it looks like
                    # a suffix
                    if self.is_suffix(piece) and len(self.last_list) > 0:
                        self.suffix_list.append(piece)
                    else:
                        self.last_list.append(piece)

                pc_len = len(post_comma_pieces)
                for i, piece in enumerate(post_comma_pieces):
                    nxt = post_comma_pieces[i + 1] if i + 1 < pc_len else None

                    if not self.first \
                            and (nxt or pc_len == 1) \
                            and self.is_title(piece):
                        self.title_list.append(piece)
                        continue
                    if not self.first:
                        self.first_list.append(piece)
                        continue
                    if self.is_suffix(piece):
                        self.suffix_list.append(piece)
                        continue
                    self.middle_list.append(piece)
                # anything after a second comma is a suffix
                if len(parts) > 2 and parts[2]:
                    self.suffix_list += parts[2:]

        # NOTE(review): len() can never be negative, so the first branch is
        # unreachable and every parse ends with unparsable=False. Left as-is
        # because switching to "== 0" would make HumanName() followed by
        # attribute assignment show up as "Unparsable" in repr().
        if len(self) < 0:
            log.info("Unparsable: \"%s\" ", self.original)
        else:
            self.unparsable = False
        self.post_process()

    def parse_pieces(self, parts, additional_parts_count=0):
        """
        Split parts on spaces and remove commas, join on conjunctions and
        lastname prefixes. If parts have periods in the middle, try splitting
        on periods and check if the parts are titles or suffixes. If they are
        add to the constant so they will be found.

        :param list parts: name part strings from the comma split
        :param int additional_parts_count:

            if the comma format contains other parts, we need to know
            how many there are to decide if things should be considered a
            conjunction.
        :return: pieces split on spaces and joined on conjunctions
        :rtype: list
        :raises TypeError: if any part is not a string
        """

        output = []
        for part in parts:
            if not isinstance(part, text_types):
                raise TypeError("Name parts must be strings. "
                                "Got {0}".format(type(part)))
            output += [x.strip(' ,') for x in part.split(' ')]

        # If part contains periods, check if it's multiple titles or suffixes
        # together without spaces if so, add the new part with periods to the
        # constants so they get parsed correctly later
        period_not_at_end = self.C.regexes.period_not_at_end
        if period_not_at_end:
            for part in output:
                # if this part has a period not at the beginning or end
                if not period_not_at_end.match(part):
                    continue
                # split on periods, any of the split pieces titles or suffixes?
                # ("Lt.Gov.")
                period_chunks = part.split(".")

                # add the part to the constant so it will be found
                if any(self.is_title(chunk) for chunk in period_chunks):
                    self.C.titles.add(part)
                elif any(self.is_suffix(chunk) for chunk in period_chunks):
                    self.C.suffix_not_acronyms.add(part)

        return self.join_on_conjunctions(output, additional_parts_count)

    def join_on_conjunctions(self, pieces, additional_parts_count=0):
        """
        Join conjunctions to surrounding pieces. Title- and prefix-aware. e.g.:

            ['Mr.', 'and'. 'Mrs.', 'John', 'Doe'] ==>
                            ['Mr. and Mrs.', 'John', 'Doe']

            ['The', 'Secretary', 'of', 'State', 'Hillary', 'Clinton'] ==>
                            ['The Secretary of State', 'Hillary', 'Clinton']

        When joining titles, saves newly formed piece to the instance's titles
        constant so they will be parsed correctly later. E.g. after parsing the
        example names above, 'The Secretary of State' and 'Mr. and Mrs.' would
        be present in the titles constant set.

        :param list pieces: name pieces strings after split on spaces
        :param int additional_parts_count:
        :return: new list with piece next to conjunctions merged into one piece
            with spaces in it.
        :rtype: list

        """
        length = len(pieces) + additional_parts_count
        # don't join on conjunctions if there's only 2 parts
        if length < 3:
            return pieces

        rootname_pieces = [p for p in pieces if self.is_rootname(p)]
        total_length = len(rootname_pieces) + additional_parts_count

        # find all the conjunctions, join any conjunctions that are next to each
        # other, then join those newly joined conjunctions and any single
        # conjunctions to the piece before and after it
        conj_index = [i for i, piece in enumerate(pieces)
                      if self.is_conjunction(piece)]

        delete_i = []
        for start, end in group_contiguous_integers(conj_index):
            new_piece = " ".join(pieces[start:end + 1])
            delete_i += list(range(start + 1, end + 1))
            pieces[start] = new_piece
            # add newly joined conjunctions to constants to be found later
            self.C.conjunctions.add(new_piece)

        for i in reversed(delete_i):
            # delete pieces in reverse order or the index changes on each delete
            del pieces[i]

        if len(pieces) == 1:
            # if there's only one piece left, nothing left to do
            return pieces

        # refresh conjunction index locations
        conj_index = [i for i, piece in enumerate(pieces) if self.is_conjunction(piece)]

        # conj_index is deliberately updated in place while being iterated so
        # that later positions account for the pieces removed so far
        for i in conj_index:
            if len(pieces[i]) == 1 and total_length < 4:
                # if there are only 3 total parts (minus known titles, suffixes
                # and prefixes) and this conjunction is a single letter, prefer
                # treating it as an initial rather than a conjunction.
                # http://code.google.com/p/python-nameparser/issues/detail?id=11
                continue

            if i == 0:
                new_piece = " ".join(pieces[i:i+2])
                if self.is_title(pieces[i+1]):
                    # when joining to a title, make new_piece a title too
                    self.C.titles.add(new_piece)
                pieces[i] = new_piece
                pieces.pop(i+1)
                # subtract 1 from the index of all the remaining conjunctions
                for j, val in enumerate(conj_index):
                    if val > i:
                        conj_index[j] = val-1

            else:
                new_piece = " ".join(pieces[i-1:i+2])
                if self.is_title(pieces[i-1]):
                    # when joining to a title, make new_piece a title too
                    self.C.titles.add(new_piece)
                pieces[i-1] = new_piece
                pieces.pop(i)
                rm_count = 2
                try:
                    pieces.pop(i)
                except IndexError:
                    # the conjunction was the last piece; only it was removed
                    rm_count = 1

                # subtract the number of removed pieces from the index
                # of all the remaining conjunctions
                for j, val in enumerate(conj_index):
                    if val > i:
                        conj_index[j] = val - rm_count

        # join prefixes to following lastnames: ['de la Vega'], ['van Buren']
        prefixes = list(filter(self.is_prefix, pieces))
        if prefixes:
            for prefix in prefixes:
                try:
                    i = pieces.index(prefix)
                except ValueError:
                    # If the prefix is no longer in pieces, it's because it has been
                    # combined with the prefix that appears right before (or before that when
                    # chained together) in the last loop, so the index of that newly created
                    # piece is the same as in the last loop, i==i still, and we want to join
                    # it to the next piece.
                    pass

                new_piece = ''

                # join everything after the prefix until the next prefix or suffix

                try:
                    if i == 0 and total_length >= 1:
                        # If it's the first piece and there are more than 1 rootnames, assume it's a first name
                        continue
                    next_prefix = next(iter(filter(self.is_prefix, pieces[i + 1:])))
                    j = pieces.index(next_prefix)
                    if j == i + 1:
                        # if there are two prefixes in sequence, join to the following piece
                        j += 1
                    new_piece = ' '.join(pieces[i:j])
                    pieces = pieces[:i] + [new_piece] + pieces[j:]
                except StopIteration:
                    try:
                        # if there are no more prefixes, look for a suffix to stop at
                        stop_at = next(iter(filter(self.is_suffix, pieces[i + 1:])))
                        j = pieces.index(stop_at)
                        new_piece = ' '.join(pieces[i:j])
                        pieces = pieces[:i] + [new_piece] + pieces[j:]
                    except StopIteration:
                        # if there were no suffixes, nothing to stop at so join all
                        # remaining pieces
                        new_piece = ' '.join(pieces[i:])
                        pieces = pieces[:i] + [new_piece]

        log.debug("pieces: %s", pieces)
        return pieces

    # Capitalization Support

    def cap_word(self, word, attribute):
        """
        Capitalize a single word of the name attribute ``attribute``.

        Prefixes (in last and middle names only) and conjunctions are
        lowercased, entries in ``C.capitalization_exceptions`` are returned
        as configured, words matching the ``mac`` regex get both of its
        groups capitalized, and everything else is ``str.capitalize()``-d.
        """
        if (self.is_prefix(word) and attribute in ('last', 'middle')) \
                or self.is_conjunction(word):
            return word.lower()
        exceptions = self.C.capitalization_exceptions
        if lc(word) in exceptions:
            return exceptions[lc(word)]
        mac_match = self.C.regexes.mac.match(word)
        if mac_match:
            def cap_after_mac(m):
                return m.group(1).capitalize() + m.group(2).capitalize()
            return self.C.regexes.mac.sub(cap_after_mac, word)
        return word.capitalize()

    def cap_piece(self, piece, attribute):
        """
        Apply :py:func:`cap_word` to every match of the ``word`` regex in
        ``piece``. Returns ``""`` for an empty piece.
        """
        if not piece:
            return ""

        def replacement(m):
            return self.cap_word(m.group(0), attribute)
        return self.C.regexes.word.sub(replacement, piece)

    def capitalize(self, force=None):
        """
        The HumanName class can try to guess the correct capitalization of name
        entered in all upper or lower case. By default, it will not adjust the
        case of names entered in mixed case. To run capitalization on all names
        pass the parameter `force=True`.

        :param bool force: Forces capitalization of mixed case strings. This
            parameter overrides rules set within
            :py:class:`~nameparser.config.CONSTANTS`.

        **Usage**

        .. doctest:: capitalize

            >>> name = HumanName('bob v. de la macdole-eisenhower phd')
            >>> name.capitalize()
            >>> str(name)
            'Bob V. de la MacDole-Eisenhower Ph.D.'
            >>> # Don't touch good names
            >>> name = HumanName('Shirley Maclaine')
            >>> name.capitalize()
            >>> str(name)
            'Shirley Maclaine'
            >>> name.capitalize(force=True)
            >>> str(name)
            'Shirley MacLaine'

        """
        name = u(self)
        force = self.C.force_mixed_case_capitalization \
            if force is None else force

        if not force and not (name == name.upper() or name == name.lower()):
            return

        def recapitalized(value, attribute, separator):
            # Drop empty strings: "".split(" ") yields [""], which would leave
            # a phantom entry in the list and a stray leading space in
            # anything joining it with another list (e.g. surnames).
            capped = self.cap_piece(value, attribute)
            return [chunk for chunk in capped.split(separator) if chunk]

        self.title_list = recapitalized(self.title, 'title', ' ')
        self.first_list = recapitalized(self.first, 'first', ' ')
        self.middle_list = recapitalized(self.middle, 'middle', ' ')
        self.last_list = recapitalized(self.last, 'last', ' ')
        self.suffix_list = recapitalized(self.suffix, 'suffix', ', ')

    def handle_capitalization(self):
        """
        Handles capitalization configurations set within
        :py:class:`~nameparser.config.CONSTANTS`.
        """
        if self.C.capitalize_name:
            self.capitalize()