Coverage for blind_charging/officer.py: 78%

133 statements  

« prev     ^ index     » next       coverage.py v6.5.0, created at 2023-02-17 20:36 +0000

1import itertools 

2import re 

3from collections import defaultdict 

4from typing import DefaultDict, List, Optional 

5 

6from .individual import Individual, MergeDifferentPersonsError 

7from .locale import Locale 

8from .source_text import nlp 

9 

10 

11def _get_name_pattern( 

12 parts: List[Optional[str]], star: Optional[str] = None 

13) -> Optional[str]: 

14 """Get a regular expression that joins a set of parts. 

15 

16 :param parts: Parts to join with space pattern 

17 :param star: Optional star part to append to the end 

18 :returns: Regex, or None if there were no parts to join 

19 """ 

20 non_null = filter(None, parts) 

21 if not non_null: 

22 return None 

23 pfx = r"\s+".join(non_null) 

24 if star: 

25 return pfx + star 

26 return pfx 

27 

28 

class OfficerName(Individual):
    """An officer reference parsed from a raw text mention.

    The constructor breaks the mention into an optional title, an optional
    star number (digits following ``#``), an optional shift-style code and a
    set of upper-cased name tokens. Instances can be compared, merged, and
    turned into a list of regex patterns that match the ways the officer may
    be written in text.
    """

    # TODO(itsmrlin): double check name regex to prevent catastrophic backtracking
    # TODO(itsmrlin): automate capitalization variation
    # officer title re
    t_re = (
        r"(Sheriff|Insp\.?|Inspector|Officer|Ofc\.?|"
        r"Off\.?|Sergeant|Sgt\.?|Commissioner|Comm\.?|"
        r"commissioner|comm\.?|FTO|PSA)\s+"
    )
    # "5 digit" code re: a digit, an upper-case letter, then 2-3 more digits
    # or upper-case letters, optionally wrapped in parentheses
    dgt5_re = r"(\s|^)?\(?[0-9][A-Z][0-9A-Z]{2,3}\)?(\s|\.|$)"
    # name regex
    n_re = r"[A-Z][A-Za-z\-\']*\s*"
    # star regex: "#" followed by 3-5 digits (the digits are captured)
    star_re = r"(?:#\s*)([0-9]{3,5})\b"

    def __init__(self, name):
        """Parse a raw officer mention.

        :param name: Raw text of the officer reference; it is split on
            whitespace and each token is classified as a title, a name token
            or a shift-style code. Tokens matching none of these are ignored.
        """
        ofc_str = name
        self._dict = {"ofc_str": ofc_str}
        self.dgt5_code = None
        self.star = None
        self.title = None
        self.name = []
        self.code_name = None
        self.cls = ""
        # Upper-cased title spellings (without trailing ".") -> canonical title
        self.officer_titles = {
            "OFFICER": "Officer",
            "OFC": "Officer",
            "OFF": "Officer",
            "SERGEANT": "Sergeant",
            "SGT": "Sergeant",
            "INSPECTOR": "Inspector",
            "INSP": "Inspector",
            "SHERIFF": "Sheriff",
            "COMMISSIONER": "Commissioner",
            "COMM": "Commissioner",
            "FTO": "FTO",
            "PSA": "PSA",
        }
        # title to abbr.: canonical title -> every spelling that maps to it
        self.t2abbr = {}
        for spelling, canonical in self.officer_titles.items():
            self.t2abbr.setdefault(canonical, []).append(spelling)

        # Search once and reuse the match for the captured digits.
        star_match = re.search(OfficerName.star_re, ofc_str)
        if star_match:
            self.star = str(star_match.group(1))

        for pp in ofc_str.split():
            p = pp.upper().strip()
            m = re.match(r"\b[A-Za-z\-\']+\b", p)
            is_officer_title = p.strip(".") in self.officer_titles
            if is_officer_title:
                self.title = self.officer_titles[p.strip(".")]
            elif m and not nlp.vocab[p].is_stop:
                # Name token: drop any trailing run of non-letters.
                p_clean = re.sub(r"[^A-Z]+$", "", p)
                self.name.append(p_clean)
            elif re.match(OfficerName.dgt5_re, p) is not None:
                self.dgt5_code = p.strip("(").strip(")")

        # Names are compared and merged as an unordered set of tokens.
        self.name = set(self.name)

    def __eq__(self, other):
        """Decide whether two references denote the same officer.

        :returns: False when the codes differ; otherwise True when both share
            a star number or at least one name token, else False.
            ``NotImplemented`` for objects that are not ``OfficerName``.
        """
        if not isinstance(other, OfficerName):
            return NotImplemented

        # dgt5_code is likely a shift number (including 2 officers) and
        # we do not match officer based on that
        # but we DO know if they are different then officers are different
        if self.dgt5_code != other.dgt5_code:
            return False

        if self.star is not None and self.star == other.star:
            return True

        # Any shared name token counts as a match.
        return not self.name.isdisjoint(other.name)

    def __hash__(self):
        # NOTE(review): the hash is derived from mutable state (see merge) and
        # two instances that compare equal can still hash differently, since
        # equality only needs a shared star or name token.
        return hash(str(self))

    def __str__(self):
        return str(
            {
                "code_name": self.code_name,
                "dgt5_code": self.dgt5_code,
                "star": self.star,
                "title": self.title,
                "name": self.name,
            }
        )

    def __repr__(self):
        return self.__str__()

    def get_indicator(self):
        """Return the assigned code name, or a placeholder if none is set."""
        if self.code_name is None:
            return "CODE NAME NOT ASSIGNED!"

        return self.code_name

    def to_dict(self):
        """Return a shallow copy of the dict holding the raw input string."""
        return self._dict.copy()

    def merge(self, other):
        """Fold the details of ``other`` into this officer in place.

        Missing code, star and title are taken from ``other``; name tokens
        are unioned.

        :param other: Officer reference that compares equal to this one
        :raises MergeDifferentPersonsError: If the two do not compare equal
        """
        if self != other:
            raise MergeDifferentPersonsError

        if self.dgt5_code is None:
            self.dgt5_code = other.dgt5_code

        if self.star is None:
            self.star = other.star

        if self.title is None:
            self.title = other.title

        self.name = self.name.union(other.name)

    def _name_rep_impl(self):
        """Build the regex patterns that can match this officer in text.

        :returns: List of ``\\b``-delimited pattern strings, longest pattern
            first (ties broken lexicographically so the order is stable).
        """
        reps = set()

        # Escape the tokens so punctuation inside a name (e.g. "(" or ".")
        # is matched literally instead of producing a broken or over-broad
        # regular expression.
        name_tokens = [re.escape(token) for token in self.name]
        # Every ordering of one or two name tokens.
        combined_names = [
            r"\s+".join(combo)
            for size in range(1, 3)
            for combo in itertools.permutations(name_tokens, size)
        ]

        if self.dgt5_code is None:
            dgt5_code = None
        else:
            dgt5_code = r"\(?%s\)?" % re.escape(self.dgt5_code)
            # 1A23B
            reps.add(dgt5_code)

        if self.star:
            star_regex = r"\s*#\s*" + re.escape(self.star)
            reps.add(star_regex)
        else:
            star_regex = None

        for n in combined_names:
            # John Doe
            reps.add(_get_name_pattern([n]))
            # John Doe #1234
            reps.add(_get_name_pattern([n], star_regex))
            if dgt5_code:
                # 1A23B John Doe #1234
                reps.add(_get_name_pattern([dgt5_code, n], star_regex))

        if self.title is not None:
            for t in self.t2abbr[self.title]:
                title_part = t + r"\.?"

                if self.star is not None:
                    # Officer #1234 (a bare title is too generic to add)
                    reps.add(_get_name_pattern([title_part], star_regex))
                # NOTE(review): the listing this was reviewed from had lost
                # its indentation; this check is assumed to sit at loop level
                # rather than nested under the star check above. Confirm
                # against the original file.
                if dgt5_code:
                    # 1A23B Officer #1234
                    reps.add(_get_name_pattern([dgt5_code, title_part], star_regex))

                for n in combined_names:
                    # officer john doe
                    reps.add(_get_name_pattern([title_part, n]))
                    # officer john doe #1234
                    reps.add(_get_name_pattern([title_part, n], star_regex))
                    if dgt5_code:
                        # 1a23b officer john doe #1234
                        reps.add(
                            _get_name_pattern([dgt5_code, title_part, n], star_regex)
                        )

        # _get_name_pattern may yield None when it has nothing to join.
        reps.discard(None)

        reps = {r"\b%s\b" % x for x in reps}

        # TODO(jnu): the longest pattern is not necessarily going to yield the
        # longest match. It's an ok heuristic for now, but really we should
        # match all the patterns and resolve overlaps by choosing the longest
        # match.
        return sorted(reps, key=lambda pattern: (-len(pattern), pattern))

    @classmethod
    def dedupe(
        cls, officers: List["OfficerName"], locale: Locale
    ) -> List["OfficerName"]:
        """Merge duplicated officer references.

        After the parent class has de-duplicated the list, every remaining
        officer is given a ``code_name``: numbered per title when a title is
        known (or implied by a star number), otherwise a generic label.

        :param officers: List of officers
        :param locale: Location information
        :returns: De-duplicated list of officers
        """
        persons = super(OfficerName, cls).dedupe(officers, locale)

        # Running count per title, used to number the code names.
        type_counts: DefaultDict[str, int] = defaultdict(int)
        for p in persons:
            # A star number without an explicit title implies "Officer".
            if p.star is not None and p.title is None:
                title = "Officer"
            else:
                title = p.title

            if not title:
                # if no title and no star, but has the 5 digit code
                # it's probably a team/partnership
                if p.dgt5_code is not None:
                    p.code_name = "[officer pair]"
                else:
                    p.code_name = "an officer"
            else:
                type_counts[title] += 1
                p.code_name = "%s #%d" % (title, type_counts[title])
            # TODO(jnu): clean up how the class is applied
            # NOTE(review): assumed to apply to every person, not only the
            # titled branch; the reviewed listing had lost its indentation.
            p.cls = "masked-officer"

        return persons