Coverage for blind_charging/masker.py: 90%

245 statements  

« prev     ^ index     » next       coverage.py v6.5.0, created at 2025-10-20 15:43 +0000

1import itertools 

2import re 

3from typing import DefaultDict, Dict, Generator, Iterable, List, Set, Union 

4 

5from .annotation import Redaction 

6from .locale import Locale 

7from .locale.const import USPS_STREET_ABBR 

8from .mask_const import ( 

9 APPEARANCE_LIST, 

10 COUNTRIES, 

11 EYE_COLORS, 

12 EYE_REF, 

13 GENERAL_COLORS, 

14 HAIR_ADJS, 

15 HAIR_COLORS, 

16 HAIR_REF, 

17 LANGUAGES, 

18 NATIONALITIES, 

19 PERSON_REF, 

20 RACE_ABBREV, 

21 RACE_FEATURES, 

22 RACE_WORDS, 

23 SENSITIVE_HAIR_REF, 

24 SKIN_COLORS, 

25) 

26from .officer import OfficerName 

27from .person import PersonName, _name_match 

28from .re_util import re_literal_group 

29from .source_text import SourceText 

30from .text_processing import get_officers_from_narrative, get_persons_from_narrative 

31 

32AnyPerson = Union[OfficerName, PersonName] 

33 

34 

35# TODO(jnu): rewrite to generalize common behaviors. Really we only have three 

36# approaches: using PersonNames, using RegEx, and using NER. Generalize these 

37# as first-class rules that can be parameterized and applied. 

38 

39 

def _re_literal_adj_list(adjectives: Iterable[str]) -> str:
    """
    Create a RegExp pattern for matching a list of adjectives with literals.

    Handles comma-separated runs of adjectives, optionally joined by a
    conjunction ("and"/"or" or "&"/"/") with an optional determiner.

    :param adjectives: List of adjective literals
    :returns: RegExp pattern
    """
    parts = {
        "adj": re_literal_group(adjectives),
        "cnj": re_literal_group(["and", "or"], capture=False),
        "cnj_sym": re_literal_group(["&", "/"], capture=False),
        "det": re_literal_group(["a", "an", "the", "some", "any"], capture=False),
    }

    template = (
        # fmt: off
        r"\b{adj}(?:\s+,?\s*{adj},?)*"
        r"(?:(?:\s+{cnj}\s+|\s*{cnj_sym}\s*)(?:{det}\s+)?{adj}(?:\s+,?\s*{adj},?)*)?\b"
        # fmt: on
    )
    return template.format(**parts)

59 

60 

def _re_literal_noun_phrase(adjectives: Iterable[str], nouns: Iterable[str]) -> str:
    """Create a RegExp pattern for matching a simple noun phrase with literals.

    Example:
        pattern = f(["green", "black"], ["frog", "toad"])

    This pattern would match "green frog" and "black toad" and even
    "green and black toad."

    :param adjectives: List of adjective literals
    :param nouns: List of noun literals
    :returns: RegExp pattern (the noun is captured in a group named "noun")
    """
    return r"{adj}\s+{n}\b".format(
        adj=_re_literal_adj_list(adjectives),
        n=re_literal_group(nouns, name="noun"),
    )

78 

79 

def _redact_entities(
    doc: SourceText, literals: Iterable[str], placeholder: str, info: str = ""
) -> Generator[Redaction, None, None]:
    """Redact NLP entities matching the given list.

    :param doc: Source text
    :param literals: List of literal strings to match
    :param placeholder: String to use in lieu of matched entities
    :param info: Comment to pass to redaction for tracing
    :yields: Redactions
    """
    name_group = re_literal_group(literals, capture=False)
    # The lazy prefix group lets the longest literal in the group win.
    entity_re = re.compile(
        r"(.*?\s+)??\b{}\b(\s+.*)?".format(name_group), re.IGNORECASE
    )

    # Walk the entities back-to-front so earlier offsets stay valid as we go.
    for ent in doc.nlp.ents[::-1]:
        if not doc.can_redact(ent.start_char, ent.end_char):
            continue
        match = entity_re.match(ent.text)
        if not match:
            continue
        prefix = match.group(1) or ""
        suffix = match.group(2) or ""
        yield doc.redact(
            ent.start_char,
            ent.end_char,
            "{}[{}]{}".format(prefix, placeholder, suffix),
            info=info,
        )

107 

108 

def _redact_words(
    doc: SourceText, literals: Iterable[str], placeholder: str, info: str = ""
) -> Generator[Redaction, None, None]:
    """Redact words as tokenized by NLP.

    :param doc: Source text
    :param literals: List of literal strings to match (case-sensitive)
    :param placeholder: String to use in lieu of matching words
    :param info: Comment to pass to redaction for tracing
    :yields: Redaction
    """
    targets = set(literals)
    substitute = "[{}]".format(placeholder)

    # Iterate tokens back-to-front so earlier offsets stay valid.
    for token in list(doc.nlp)[::-1]:
        begin = token.idx
        finish = begin + len(token)
        if doc.can_redact(begin, finish) and token.text in targets:
            yield doc.redact(begin, finish, substitute, info=info)

130 

131 

def mask_skin_color(
    doc: SourceText, placeholder: str = "race/ethnicity"
) -> Generator[Redaction, None, None]:
    """Generate redactions for words used to describe skin color.

    E.g., "black person" -> "[race/ethnicity] person"

    NOTE: There may be overlap here with rules that deal with ethnicity
    directly.

    :param doc: Source text
    :param placeholder: String to use in lieu of skin color words.
    :yields: Redactions
    """
    skin_color_re = re.compile(
        _re_literal_noun_phrase(SKIN_COLORS | RACE_WORDS, PERSON_REF),
        re.IGNORECASE,
    )

    for match in skin_color_re.finditer(doc.text):
        begin, finish = match.span()
        yield doc.redact(
            begin,
            finish,
            "[{}] {}".format(placeholder, match.group("noun")),
            info="skin color",
        )

153 

154 

def mask_hair_color(
    doc: SourceText, placeholder: str = "color"
) -> Generator[Redaction, None, None]:
    """Generate redactions for hair color.

    E.g., "red hair" -> "[color] hair"

    :param doc: Source text
    :param placeholder: String to use in lieu of color word
    :yields: Redactions
    """
    hair_color_re = re.compile(
        _re_literal_noun_phrase(GENERAL_COLORS | HAIR_COLORS, HAIR_REF),
        re.IGNORECASE,
    )

    for match in hair_color_re.finditer(doc.text):
        begin, finish = match.span()
        yield doc.redact(
            begin,
            finish,
            "[{}] {}".format(placeholder, match.group("noun")),
            info="hair color",
        )

174 

175 

def mask_hair_style(
    doc: SourceText, placeholder: str = "hairstyle"
) -> Generator[Redaction, None, None]:
    """Generate redactions for hair styles.

    E.g., "black short afro hair" -> "[hairstyle] hair"

    :param doc: Source text
    :param placeholder: String to use in lieu of hair style
    :yields: Redaction
    """
    adjs = SENSITIVE_HAIR_REF | HAIR_ADJS | GENERAL_COLORS | HAIR_COLORS
    nouns = SENSITIVE_HAIR_REF | HAIR_REF
    substitute = "[{}] hair".format(placeholder)

    # Two passes: full noun phrases first, then bare sensitive hair terms.
    patterns = (
        _re_literal_noun_phrase(adjs, nouns),
        re_literal_group(SENSITIVE_HAIR_REF),
    )
    for raw_pattern in patterns:
        hairstyle_re = re.compile(raw_pattern, re.IGNORECASE)
        for match in hairstyle_re.finditer(doc.text):
            begin, finish = match.span()
            yield doc.redact(begin, finish, substitute, info="hair style")

199 

200 

def mask_eye_color(
    doc: SourceText, placeholder: str = "color"
) -> Generator[Redaction, None, None]:
    """Generate redactions for eye color.

    E.g., "blue eyes" -> "[color] eyes"

    :param doc: Source text
    :param placeholder: String to use in lieu of color word
    :yields: Redactions
    """
    eye_color_re = re.compile(
        _re_literal_noun_phrase(GENERAL_COLORS | EYE_COLORS, EYE_REF),
        re.IGNORECASE,
    )

    for match in eye_color_re.finditer(doc.text):
        begin, finish = match.span()
        yield doc.redact(
            begin,
            finish,
            "[{}] {}".format(placeholder, match.group("noun")),
            info="eye color",
        )

220 

221 

def mask_country(
    doc: SourceText, placeholder: str = "country"
) -> Generator[Redaction, None, None]:
    """Generate redactions for country names.

    E.g., "Burundi" -> "[country]"

    :param doc: Source text
    :param placeholder: String to use in lieu of country name
    :yields: Redactions
    """
    # Sweep NER entities first, then catch remaining bare tokens.
    for redaction in _redact_entities(doc, COUNTRIES, placeholder, info="country"):
        yield redaction
    for redaction in _redact_words(doc, COUNTRIES, placeholder, info="country"):
        yield redaction

235 

236 

def mask_language(
    doc: SourceText, placeholder: str = "language"
) -> Generator[Redaction, None, None]:
    """Generate redactions for language names.

    E.g., "Spanish" -> "[language]"

    (The original docstring said "nationalities" — a copy-paste error from
    the neighboring ``mask_nationality``; this rule redacts LANGUAGES.)

    :param doc: Source text
    :param placeholder: String to use in lieu of language
    :yields: Redactions
    """
    # Sweep NER entities first, then catch remaining bare tokens.
    yield from _redact_entities(doc, LANGUAGES, placeholder, info="language")
    yield from _redact_words(doc, LANGUAGES, placeholder, info="language")

250 

251 

def mask_nationality(
    doc: SourceText, placeholder: str = "nationality/ethnicity"
) -> Generator[Redaction, None, None]:
    """Generate redactions for nationalities.

    E.g., "Mexican" -> "[nationality/ethnicity]"

    :param doc: Source text
    :param placeholder: String to use in lieu of nationality
    :yields: Redactions
    """
    # NOTE(acw): Tried using spacy's NER classifier alone here, but it would
    # too often classify irrelevant words (e.g., "5/18/2019" or "Silver Honda")
    # as languages or locations.
    for redaction in _redact_entities(
        doc, NATIONALITIES, placeholder, info="nationality"
    ):
        yield redaction
    for redaction in _redact_words(doc, NATIONALITIES, placeholder, info="nationality"):
        yield redaction

268 

269 

def mask_race(
    doc: SourceText, placeholder: str = "race/ethnicity"
) -> Generator[Redaction, None, None]:
    """Generate redactions for words that directly indicate race.

    E.g., "African American" -> "[race/ethnicity]"

    :param doc: Source text
    :param placeholder: String to use in lieu of race/ethnicity
    :yields: Redactions
    """
    race_re = re.compile(_re_literal_adj_list(RACE_WORDS), re.IGNORECASE)
    substitute = "[{}]".format(placeholder)

    for match in race_re.finditer(doc.text):
        begin, finish = match.span()
        yield doc.redact(begin, finish, substitute, info="race")

288 

289 

def mask_other_literals(
    doc: SourceText,
    literals: dict[str, list[str]] | None,
) -> Generator[Redaction, None, None]:
    """Generate redactions based on custom lists of literal words.

    Example:
        literals = {
            "district": ["lake district", "park district"],
        }

        "The suspect was last seen the Park District" ->
        "The suspect was last seen in the [district]"

    :param doc: Source text
    :param literals: Dictionary describing literal words to redact. Keys will
        be used to substitute for each of the values in the associated list.
    :yields: Redactions
    """
    # Treat a missing dictionary as "no custom redactions".
    for label, words in (literals or {}).items():
        word_re = re.compile(re_literal_group(words), re.IGNORECASE)
        substitute = "[{}]".format(label)

        for match in word_re.finditer(doc.text):
            begin, finish = match.span()
            yield doc.redact(begin, finish, substitute, info=label)

320 

321 

def mask_race_correlated_feature(
    doc: SourceText, placeholder: str = "physical description"
) -> Generator[Redaction, None, None]:
    """Generate redactions for feature that are highly correlated with race
    without context.

    E.g., "We saw a blonde" -> "We saw a [physical description]"

    :param doc: Source text
    :param placeholder: String to use in lieu of race-correlated features
    :yields: Redactions
    """
    feature_re = re.compile(
        r"\b{}\b".format(re_literal_group(RACE_FEATURES)), re.IGNORECASE
    )
    substitute = "[{}]".format(placeholder)

    for match in feature_re.finditer(doc.text):
        begin, finish = match.span()
        yield doc.redact(begin, finish, substitute, info="race")

342 

343 

def mask_race_abbrev(
    doc: SourceText, placeholder: str = "race/ethnicity"
) -> Generator[Redaction, None, None]:
    """Generate redactions for abbreviated words that directly indicate race.

    E.g., "AMA" -> "[race/ethnicity] male adult"

    :param doc: Source text
    :param placeholder: String to use in lieu of race/ethnicity
    :yields: Redactions
    """
    # Case-sensitive on purpose: only the uppercase abbreviation form counts.
    abbrev_re = re.compile(r"(?<=\b){}s?(?=\b)".format(RACE_ABBREV))

    sex_by_code = {"F": "female", "M": "male"}
    age_by_code = {"A": "adult", "J": "juvenile"}

    for match in abbrev_re.finditer(doc.text):
        begin, finish = match.span()
        # Groups 2 and 3 of RACE_ABBREV encode sex and age respectively
        # (presumably single letters F/M and A/J — defined in mask_const).
        substitute = "[{}] {} {}".format(
            placeholder,
            sex_by_code.get(match.group(2)),
            age_by_code.get(match.group(3)),
        )
        yield doc.redact(begin, finish, substitute, info="race")

369 

370 

def mask_appearance_list(
    doc: SourceText, placeholder: str = "color"
) -> Generator[Redaction, None, None]:
    """Generate redactions for words in list format that directly indicate race.

    E.g., "Race: Hispanic" -> "Race: [race/ethnicity]"
    E.g., "Hair: Black" -> "Hair: [color]"

    :param doc: Source text
    :param placeholder: String to use in lieu of feature
    :yields: Redactions
    """
    color_group = _re_literal_adj_list(
        SKIN_COLORS | HAIR_COLORS | HAIR_ADJS | EYE_COLORS | GENERAL_COLORS
    )
    appearance_group = re_literal_group(APPEARANCE_LIST, name="noun")
    pattern = r"{}:\s*{}".format(appearance_group, color_group)
    appearance_list_re = re.compile(pattern, re.IGNORECASE)

    for match in appearance_list_re.finditer(doc.text):
        # Compare lowercased throughout: the regex is IGNORECASE, so the
        # noun may arrive as "Eyes"/"HAIR"/etc. The original only lowercased
        # the race/complexion check, mislabeling e.g. "Eyes: blue" as a
        # generic "appearance list" entry.
        noun = match.group("noun").lower()
        # Use a per-match placeholder. The original reassigned the shared
        # `placeholder` parameter when it saw "Race:", which leaked
        # "race/ethnicity" into every later hair/eye match in the document.
        substitute_label = placeholder
        if noun in ("race", "complexion"):
            substitute_label = "race/ethnicity"
            info = "race"
        elif noun == "eyes":
            info = "eye color"
        elif noun == "hair":
            info = "hair color"
        else:
            info = "appearance list"

        start, end = match.span()
        replacement = "{}: [{}]".format(match.group("noun"), substitute_label)
        yield doc.redact(start, end, replacement, info=info)

404 

405 

def mask_street_address(
    doc: SourceText, placeholder: str = "location"
) -> Generator[Redaction, None, None]:
    """Generate redactions for street addresses.

    E.g., "123 Maple St." -> "[location] St."

    :param doc: Source text
    :param placeholder: Text to use in lieu of literal street address
    :yields: Redactions
    """
    suffix_group = re_literal_group(USPS_STREET_ABBR)
    street_addr_re = re.compile(
        r"(?:\d{1,5} [\w\s]{1,20}) (" + suffix_group + r"\.?)\W?(?=\s|$)",
        re.IGNORECASE,
    )

    # Reject speed readings ("30 mph") and freeway lane references
    # ("#2 lane", "N/B lane") that look like addresses.
    false_positive_re = re.compile(
        r"\d{1,3}\s?mph\b|\b#?\d\s?([nesw]/?b\s?)?lane\b",  # speed | lane in road
        re.IGNORECASE,
    )

    for match in street_addr_re.finditer(doc.text):
        if false_positive_re.search(match.group(0)):
            continue

        begin, finish = match.span()
        yield doc.redact(
            begin,
            finish,
            "[{}] {}".format(placeholder, match.group(1)),
            info="street address",
        )

438 

439 

def mask_district(
    doc: SourceText, locale: Locale, placeholder: str = "district"
) -> Generator[Redaction, None, None]:
    """Generate redactions for police precincts.

    :param doc: Source text
    :param locale: Locale to use for masking
    :param placeholder: Text to use in lieu of literal district name
    :yields: Redactions
    """
    for match in locale.match_district(doc.text):
        begin, finish = match.span()
        suffix = (match.group(2) or "").lower()
        # Avoid adding suffix if it'd be awkwardly redundant, as in the case
        # of "[district] district"
        if suffix == placeholder:
            suffix = ""
        replacement = "[{}]".format(placeholder)
        if suffix:
            replacement = "{} {}".format(replacement, suffix)
        yield doc.redact(begin, finish, replacement, info="district name")

460 

461 

def mask_presumed_street_name(
    doc: SourceText, placeholder: str = "street"
) -> Generator[Redaction, None, None]:
    """Generate redactions for entities that look like street names.

    E.g., "Maple St." -> "[street] St."

    :param doc: Source text
    :param placeholder: Text to use in lieu of street name
    :yields: Redactions
    """
    # Accept lowercase, Capitalized, and UPPERCASE forms of each USPS suffix.
    suffix_variants = list[str]()
    for abbr in USPS_STREET_ABBR:
        suffix_variants.extend([abbr, abbr.capitalize(), abbr.upper()])
    street_endings = re_literal_group(suffix_variants, capture=False)

    # Last pattern matches any `\b` except `\.` (matched in second pattern)
    # This keeps the period (e.g. in "St.") in the placeholder
    # NOTE(jnu): this is not case insensitive; the point is to use the
    # capitalization structure to infer words that might constitute a street
    # name.
    street_name_re = re.compile(
        r"(?:(?:\d+|[A-Z])[A-Za-z\']*\s+)+"
        + r"(%s\.?)" % street_endings
        + r"(?=[,\/#!$%\^&\*;:{}=\-_`~()\s])"
    )

    # Avoid matching false street names:
    # e.g. EB lane, E/B lane, #2 lane (on the freeway)
    false_positive_re = re.compile(r"\b(#?\d\s)?([nesw]/?b\s?)?lane\b", re.IGNORECASE)

    for match in street_name_re.finditer(doc.text):
        if false_positive_re.search(match.group(0)):
            continue

        begin, finish = match.span()
        yield doc.redact(
            begin,
            finish,
            "[{}] {}".format(placeholder, match.group(1)),
            info="presumed street name",
        )

502 

503 

def mask_known_street_name(
    doc: SourceText, locale: Locale, placeholder: str = "street"
) -> Generator[Redaction, None, None]:
    """Generate redactions for known streets in the city.

    E.g., "Arguello and Euclid" -> "[street] and [street]"

    :param doc: Source text
    :param locale: Locale to use for masking
    :param placeholder: Text to use in lieu of street name
    :yields: Redactions
    """
    tag = "[{}]".format(placeholder)
    for match in locale.match_street_name(doc.text):
        begin, finish = match.span()
        # Preserve the matched conjunction ("and", "/", ...) between the tags.
        yield doc.redact(
            begin,
            finish,
            tag + match.group("conj") + tag,
            info="known street name",
        )

522 

523 

def mask_neighborhood(
    doc: SourceText, locale: Locale, placeholder: str = "neighborhood"
) -> Generator[Redaction, None, None]:
    """Generate redactions for neighborhoods in the city.

    E.g., "Parkside" -> "[neighborhood]"

    :param doc: Source text
    :param locale: Locale to use to perform masking
    :param placeholder: Text to use in lieu of neighborhood name
    :yields: Redactions
    """
    # TODO(jnu): improve Locale API for matching these
    for redaction in _redact_entities(
        doc, locale.neighborhoods, placeholder, info="neighborhood"
    ):
        yield redaction

540 

541 

def _create_person_name_map(persons: Iterable[AnyPerson]) -> Dict[str, Set[AnyPerson]]:
    """Create a map from surface name representations to persons.

    The map connects the surface representations of a human name (such as
    "John P. Smith") to the PersonName instances that this name could refer to.
    In most cases this should be unique, however there may be ambiguous cases
    such as "J. Smith" that might refer to multiple individuals.

    :param persons: List of person references
    :returns: Map from names to person references
    """
    surface_to_persons = DefaultDict[str, Set[AnyPerson]](set)

    for person in persons:
        for surface in person.name_rep():
            surface_to_persons[surface].add(person)

    return dict(surface_to_persons)

560 

561 

def mask_person(
    doc: SourceText,
    persons: Iterable[AnyPerson],
    info: str,
) -> Generator[Redaction, None, None]:
    """Generate a list of redactions for the persons given in the input.

    :param doc: Source text
    :param persons: List of person references to redact
    :param info: Category label: "officer" or "person" (controls the
        replacement format; any other value raises ValueError)
    :yields: Redaction instances
    :raises ValueError: If ``info`` is not "officer" or "person"
    """
    person_signs = _create_person_name_map(persons)

    # Process surface representations of names in order of longest to shortest.
    # This means the longest names will be replaced first, which should help to
    # avoid ambiguity.
    sorted_signs = sorted(person_signs.items(), key=lambda x: len(x[0]), reverse=True)

    for signifier, signified in sorted_signs:
        # Ambiguous references:
        pattern = re.compile(signifier, re.IGNORECASE)
        ordered_signified = sorted(signified, key=lambda a: a.get_indicator())
        if info == "officer":
            # replacement as "Officer #1 or Officer #2"
            codename = " or ".join([p.get_indicator() for p in ordered_signified])
        elif info == "person":
            # replacement as "(PERSON_1 or PERSON_2)" rather than
            # "(PERSON_1) or (PERSON_2)"
            codename = "(%s)" % " or ".join(
                [re.sub(r"[\(\)]", "", p.get_indicator()) for p in ordered_signified]
            )
        else:
            # Fail fast with a clear message. The original left `codename`
            # unbound here, producing an opaque NameError on the first match.
            raise ValueError("unsupported info label: {!r}".format(info))

        for match in pattern.finditer(doc.text):
            replacement = codename
            start, end = match.span()
            # Special case: the rare terminal-apostrophe possessive, such as
            # "Moses'" where the correct redaction synthetically adds the 's.
            # TODO(jnu): probably better to handle this where we handle the
            # indefinite article redaction, in SourceText.
            if doc.text[end : end + 2] == "' ":
                replacement = codename + "'s"
                end += 1

            # NOTE: a dead statement (`ordered_signified[0]`) left over from
            # a coloring/classing TODO was removed here.
            yield doc.redact(
                start,
                end,
                replacement,
                auto_capitalize=False,
                autocorrect_article=False,
                info=info,
            )

616 

617 

def mask_person_fuzzy(
    doc: SourceText,
    persons: Iterable[PersonName],
    info: str,
) -> Generator[Redaction, None, None]:
    """Generate a list of redactions for the persons given in the input
    by redacting proper nouns in the text which are similar to last names in
    persons.

    :param doc: Source text
    :param persons: List of person references to redact
    :param info: Category label to pass to each redaction
    :yields: Redaction instances
    """
    # Only consider reasonably long proper nouns to limit false positives.
    min_character_limit = 5
    proper_nouns = {
        token
        for token in doc.nlp
        if token.pos_ == "PROPN" and len(token) > min_character_limit
    }

    for token in proper_nouns:
        begin = token.idx
        finish = begin + len(token)

        if not doc.can_redact(begin, finish):
            continue

        upper_token = {token.text.upper()}
        matched_persons = [
            person
            for person in persons
            if _name_match({f"{person.first} {person.last}"}, upper_token)
            or _name_match(person.last, upper_token, 1)
            or _name_match(person.first, upper_token, 1)
        ]

        if not matched_persons:
            continue

        replacement = "(%s)" % " or ".join(
            [re.sub(r"[\(\)]", "", person.get_indicator()) for person in matched_persons]
        )
        yield doc.redact(
            begin,
            finish,
            replacement,
            auto_capitalize=False,
            autocorrect_article=False,
            info=info,
        )

671 

672 

def mask(
    locale: Locale,
    narrative: str,
    persons: Iterable[PersonName],
    officers: Iterable[OfficerName],
    literals: dict[str, list[str]] | None = None,
) -> List[Redaction]:
    """Apply masking and formatting to narrative text.

    :param locale: Locale to use for masking
    :param narrative: Incident report text
    :param persons: List of names of people appearing in text
    :param officers: List of names of officers appearing in text
    :param literals: Optional dictionary of custom lists to extend redaction
    :returns: List of redactions
    """
    doc = SourceText(narrative)

    # Rules run in the order listed; each generator is fully consumed before
    # the next starts, exactly as itertools.chain would do.
    rules = (
        mask_person(doc, officers, "officer"),
        mask_person(doc, persons, "person"),
        mask_street_address(doc),
        mask_district(doc, locale),
        mask_known_street_name(doc, locale),
        mask_presumed_street_name(doc),
        mask_neighborhood(doc, locale),
        mask_skin_color(doc),
        mask_hair_style(doc),
        mask_hair_color(doc),
        mask_eye_color(doc),
        mask_appearance_list(doc),
        mask_race_abbrev(doc),
        mask_race(doc),
        mask_race_correlated_feature(doc),
        mask_country(doc),
        mask_language(doc),
        mask_nationality(doc),
        mask_person_fuzzy(doc, persons, "person"),
        mask_other_literals(doc, literals),
    )

    redactions = list[Redaction]()
    for rule in rules:
        redactions.extend(rule)
    return redactions

714 

715 

def merge_annotations(annotations, narrative: str) -> List[Redaction]:
    """Merge 'person' annotations that contain the same text and info
    if they are only separated by a single white space

    e.g. "(S1) (S1)" -> "(S1)"
    :param annotations: unsorted list of annotations
    :param narrative: Incident report text
    :returns: reverse sorted list of merged annotations
    """
    if not annotations or len(annotations) <= 1:
        return annotations

    # Order redactions by character number, last to first. (Sorts in place,
    # mutating the caller's list, as the original did.)
    annotations.sort(key=lambda a: a.start, reverse=True)

    merged = list[Redaction]()
    current = annotations[0]

    for candidate in annotations[1:]:
        # `candidate` precedes `current` in the text. Merge when both are
        # identical "person" tags separated by exactly one whitespace char.
        gap_is_single_space = (
            current.start - candidate.end <= 1
            and re.match(r"\s", narrative[candidate.end : current.start])
        )
        same_person_tag = (
            current.text == candidate.text
            and current.info == candidate.info
            and current.info == "person"
        )
        if current.start - candidate.end <= 1 and same_person_tag and gap_is_single_space:
            # Extend the kept annotation backwards over the duplicate.
            current.start = candidate.start
        else:
            merged.append(current)
            current = candidate
    merged.append(current)

    return merged

749 

750 

def annotate(
    locale: Locale,
    narrative: str,
    persons: Iterable[dict],
    officers: Iterable[dict],
    redact_officers_from_text: bool = True,
    literals: dict[str, list[str]] | None = None,
) -> List[Redaction]:
    """Apply redaction tool and formatting to narrative text.

    :param locale: location of narrative
    :param narrative: Incident report text
    :param persons: List of people appearing in text
    :param officers: List of officers appearing in text
    :param redact_officers_from_text: Whether to redact officers from text
    :param literals: Optional dictionary of custom lists to extend redaction
    :returns: redaction annotations
    """
    person_types = set(locale.indicators.keys())

    known_persons = [PersonName(**p) for p in locale.filter_names(persons)]
    known_officers = [OfficerName(**o) for o in officers]

    # get_persons_from_narrative only applicable to sf right now, will
    # refactor later
    known_persons += get_persons_from_narrative(narrative, 0, person_types)
    if redact_officers_from_text:
        known_officers += get_officers_from_narrative(narrative)

    known_persons = PersonName.dedupe(known_persons, locale)
    known_officers = OfficerName.dedupe(known_officers, locale)

    # Create redactions, then collapse adjacent duplicate person tags.
    annotations = mask(
        locale,
        narrative,
        persons=known_persons,
        officers=known_officers,
        literals=literals,
    )
    return merge_annotations(annotations, narrative)