Coverage for blind_charging/officer.py: 78%
133 statements
« prev ^ index » next coverage.py v6.5.0, created at 2023-02-17 20:36 +0000
« prev ^ index » next coverage.py v6.5.0, created at 2023-02-17 20:36 +0000
1import itertools
2import re
3from collections import defaultdict
4from typing import DefaultDict, List, Optional
6from .individual import Individual, MergeDifferentPersonsError
7from .locale import Locale
8from .source_text import nlp
11def _get_name_pattern(
12 parts: List[Optional[str]], star: Optional[str] = None
13) -> Optional[str]:
14 """Get a regular expression that joins a set of parts.
16 :param parts: Parts to join with space pattern
17 :param star: Optional star part to append to the end
18 :returns: Regex, or None if there were no parts to join
19 """
20 non_null = filter(None, parts)
21 if not non_null:
22 return None
23 pfx = r"\s+".join(non_null)
24 if star:
25 return pfx + star
26 return pfx
class OfficerName(Individual):
    """An officer reference parsed from a free-text string.

    Parsing extracts, when present: a title (normalized through
    ``officer_titles``), a star number (``#`` followed by 3-5 digits), a short
    alphanumeric code (see ``dgt5_re``) and a set of upper-cased name tokens.
    Instances can be compared, merged, and turned into a list of regex
    patterns describing the ways the officer may be written.
    """

    # TODO(itsmrlin): double check name regex to prevent catastrophic backtracking
    # TODO(itsmrlin): automate capitalization variation
    # officer title re
    t_re = (
        r"(Sheriff|Insp\.?|Inspector|Officer|Ofc\.?|"
        r"Off\.?|Sergeant|Sgt\.?|Commissioner|Comm\.?|"
        r"commissioner|comm\.?|FTO|PSA)\s+"
    )
    # 5 digit code re
    dgt5_re = r"(\s|^)?\(?[0-9][A-Z][0-9A-Z]{2,3}\)?(\s|\.|$)"
    # name regex
    n_re = r"[A-Z][A-Za-z\-\']*\s*"
    # star regex
    star_re = r"(?:#\s*)([0-9]{3,5})\b"

    def __init__(self, name):
        """Parse an officer reference.

        :param name: Raw text of the officer reference
        """
        ofc_str = name
        self._dict = {"ofc_str": ofc_str}
        self.dgt5_code = None
        self.star = None
        self.title = None
        self.code_name = None
        self.cls = ""
        self.officer_titles = {
            "OFFICER": "Officer",
            "OFC": "Officer",
            "OFF": "Officer",
            "SERGEANT": "Sergeant",
            "SGT": "Sergeant",
            "INSPECTOR": "Inspector",
            "INSP": "Inspector",
            "SHERIFF": "Sheriff",
            "COMMISSIONER": "Commissioner",
            "COMM": "Commissioner",
            "FTO": "FTO",
            "PSA": "PSA",
        }
        # title to abbr.
        self.t2abbr = {}
        for abbr, full_title in self.officer_titles.items():
            self.t2abbr.setdefault(full_title, []).append(abbr)

        # Search once and reuse the match object.
        star_match = re.search(OfficerName.star_re, ofc_str)
        if star_match:
            self.star = star_match.group(1)

        name_tokens = []
        for pp in ofc_str.split():
            p = pp.upper().strip()
            bare = p.strip(".")
            if bare in self.officer_titles:
                self.title = self.officer_titles[bare]
            elif re.match(r"\b[A-Za-z\-\']+\b", p) and not nlp.vocab[p].is_stop:
                # Drop trailing punctuation/digits from the token.
                name_tokens.append(re.sub(r"[^A-Z]+$", "", p))
            elif re.match(OfficerName.dgt5_re, p) is not None:
                self.dgt5_code = p.strip("(").strip(")")

        self.name = set(name_tokens)

    def __eq__(self, other):
        """Decide whether two references denote the same officer.

        Differing codes always mean different officers. Otherwise the
        references match when they share a star number or any name token.
        """
        if not isinstance(other, OfficerName):
            # Let Python fall back to its default comparison instead of
            # failing on a missing attribute.
            return NotImplemented

        # dgt5_code is likely a shift number (including 2 officers) and
        # we do not match officer based on that
        # but we DO know if they are different then officers are different
        if self.dgt5_code != other.dgt5_code:
            return False

        if self.star is not None and self.star == other.star:
            return True

        # Equal if the two references have at least one name token in common.
        return not self.name.isdisjoint(other.name)

    def __hash__(self):
        # The hash is derived from the string form, which includes fields
        # that ``merge`` and ``dedupe`` mutate; it changes when they do.
        return hash(str(self))

    def __str__(self):
        return str(
            {
                "code_name": self.code_name,
                "dgt5_code": self.dgt5_code,
                "star": self.star,
                "title": self.title,
                "name": self.name,
            }
        )

    def __repr__(self):
        return self.__str__()

    def get_indicator(self):
        """Return the assigned code name, or a placeholder if none is set."""
        if self.code_name is None:
            return "CODE NAME NOT ASSIGNED!"

        return self.code_name

    def to_dict(self):
        """Return a shallow copy of the dict holding the original string."""
        return self._dict.copy()

    def merge(self, other):
        """Fold another reference to the same officer into this one.

        Missing code, star and title are filled in from ``other``; name
        tokens are unioned.

        :param other: Reference to merge in
        :raises MergeDifferentPersonsError: If the references do not compare
            equal
        """
        if self != other:
            raise MergeDifferentPersonsError

        if self.dgt5_code is None:
            self.dgt5_code = other.dgt5_code

        if self.star is None:
            self.star = other.star

        if self.title is None:
            self.title = other.title

        self.name = self.name.union(other.name)

    def _name_rep_impl(self):
        """Build regex patterns for the ways this officer may be written.

        :returns: List of patterns, each wrapped in word boundaries, sorted
            from longest pattern string to shortest
        """
        reps = set()
        # Escape each token so punctuation captured inside a name or code is
        # matched literally and cannot produce an invalid pattern.
        safe_names = [re.escape(n) for n in self.name]
        combined_names = [
            r"\s+".join(x)
            for i in range(1, 3)
            for x in itertools.permutations(safe_names, i)
        ]

        dgt5_code = None
        if self.dgt5_code is not None:
            # 1A23B
            dgt5_code = r"\(?%s\)?" % re.escape(self.dgt5_code)
            reps.add(dgt5_code)

        if self.star:
            star_regex = r"\s*#\s*" + self.star
            reps.add(star_regex)
        else:
            star_regex = None

        for n in combined_names:
            # John Doe
            reps.add(_get_name_pattern([n]))
            # John Doe #1234
            reps.add(_get_name_pattern([n], star_regex))
            if dgt5_code:
                # 1A23B John Doe #1234
                reps.add(_get_name_pattern([dgt5_code, n], star_regex))

        if self.title is not None:
            title_pats = [t + r"\.?" for t in self.t2abbr[self.title]]

            if self.star is not None:
                for t_pat in title_pats:
                    # Officer #1234
                    reps.add(_get_name_pattern([t_pat], star_regex))
                    # NOTE(review): this branch is assumed to sit under the
                    # star check -- confirm against the original layout.
                    if dgt5_code:
                        # 1A23B Officer #1234
                        reps.add(_get_name_pattern([dgt5_code, t_pat], star_regex))

            for n in combined_names:
                for t_pat in title_pats:
                    # officer john doe
                    reps.add(_get_name_pattern([t_pat, n]))
                    # officer john doe #1234
                    reps.add(_get_name_pattern([t_pat, n], star_regex))
                    if dgt5_code:
                        # 1a23b officer john doe #1234
                        reps.add(_get_name_pattern([dgt5_code, t_pat, n], star_regex))

        reps.discard(None)

        reps = {r"\b%s\b" % x for x in reps}

        # TODO(jnu): the longest pattern is not necessarily going to yield the
        # longest match. It's an ok heuristic for now, but really we should
        # match all the patterns and resolve overlaps by choosing the longest
        # match.
        return sorted(reps, key=len, reverse=True)

    @classmethod
    def dedupe(
        cls, officers: List["OfficerName"], locale: Locale
    ) -> List["OfficerName"]:
        """Merge duplicated officer references.

        After the base-class de-duplication, every remaining officer is
        assigned a ``code_name`` and the ``masked-officer`` class.

        :param officers: List of officers
        :param locale: Location information
        :returns: De-duplicated list of officers
        """
        persons = super().dedupe(officers, locale)

        type_counts: DefaultDict[str, int] = defaultdict(int)
        for p in persons:
            # A star number without a title is treated as a plain officer.
            if p.star is not None and p.title is None:
                title = "Officer"
            else:
                title = p.title

            if title:
                type_counts[title] += 1
                p.code_name = f"{title} #{type_counts[title]:d}"
            elif p.dgt5_code is not None:
                # if no title and no star, but has the 5 digit code
                # it's probably a team/partnership
                p.code_name = "[officer pair]"
            else:
                p.code_name = "an officer"
            # TODO(jnu): clean up how the class is applied
            # NOTE(review): assumed to apply to every officer in the loop,
            # not only titled ones -- confirm against the original layout.
            p.cls = "masked-officer"

        return persons