Coverage for blind_charging/masker.py: 90%

235 statements  

« prev     ^ index     » next       coverage.py v6.5.0, created at 2023-02-17 20:36 +0000

1import itertools 

2import re 

3from typing import DefaultDict, Dict, Generator, Iterable, List, Set, Union 

4 

5from .annotation import Redaction 

6from .locale import Locale 

7from .locale.const import USPS_STREET_ABBR 

8from .mask_const import ( 

9 APPEARANCE_LIST, 

10 COUNTRIES, 

11 EYE_COLORS, 

12 EYE_REF, 

13 GENERAL_COLORS, 

14 HAIR_ADJS, 

15 HAIR_COLORS, 

16 HAIR_REF, 

17 LANGUAGES, 

18 NATIONALITIES, 

19 PERSON_REF, 

20 RACE_ABBREV, 

21 RACE_FEATURES, 

22 RACE_WORDS, 

23 SENSITIVE_HAIR_REF, 

24 SKIN_COLORS, 

25 SLURS, 

26) 

27from .officer import OfficerName 

28from .person import PersonName, _name_match 

29from .re_util import re_literal_group 

30from .source_text import SourceText 

31from .text_processing import get_officers_from_narrative, get_persons_from_narrative 

32 

33AnyPerson = Union[OfficerName, PersonName] 

34 

35 

36# TODO(jnu): rewrite to generalize common behaviors. Really we only have three 

37# approaches: using PersonNames, using RegEx, and using NER. Generalize these 

38# as first-class rules that can be parameterized and applied. 

39 

40 

def _re_literal_adj_list(adjectives: Iterable[str]) -> str:
    """Build a RegExp pattern matching a run of literal adjectives.

    The pattern accepts comma-separated sequences of adjectives, optionally
    joined by a conjunction ("and"/"or") or a symbol ("&"/"/"), with an
    optional determiner before the trailing adjective run.

    :param adjectives: List of adjective literals
    :returns: RegExp pattern
    """
    groups = {
        "adj": re_literal_group(adjectives),
        "cnj": re_literal_group(["and", "or"], capture=False),
        "cnj_sym": re_literal_group(["&", "/"], capture=False),
        "det": re_literal_group(["a", "an", "the", "some", "any"], capture=False),
    }

    template = (
        # fmt: off
        r"\b{adj}(?:\s+,?\s*{adj},?)*"
        r"(?:(?:\s+{cnj}\s+|\s*{cnj_sym}\s*)(?:{det}\s+)?{adj}(?:\s+,?\s*{adj},?)*)?\b"
        # fmt: on
    )
    return template.format(**groups)

60 

61 

def _re_literal_noun_phrase(adjectives: Iterable[str], nouns: Iterable[str]) -> str:
    """Create a RegExp pattern matching a simple "<adjectives> <noun>" phrase.

    Example:
        pattern = f(["green", "black"], ["frog", "toad"])

    This pattern would match "green frog" and "black toad" and even
    "green and black toad."

    :param adjectives: List of adjective literals
    :param nouns: List of noun literals
    :returns: RegExp pattern (the noun is captured in a group named "noun")
    """
    return r"{adj}\s+{n}\b".format(
        adj=_re_literal_adj_list(adjectives),
        n=re_literal_group(nouns, name="noun"),
    )

79 

80 

def _redact_entities(
    doc: SourceText, literals: Iterable[str], placeholder: str, info: str = ""
) -> Generator[Redaction, None, None]:
    """Redact NLP entities matching the given list.

    :param doc: Source text
    :param literals: List of literal strings to match
    :param placeholder: String to use in lieu of matched entities
    :param info: Comment to pass to redaction for tracing
    :yields: Redactions
    """
    name_group = re_literal_group(literals, capture=False)
    # Lazy optional prefix so the longest literal name can win the match.
    pattern = re.compile(
        r"(.*?\s+)??\b{}\b(\s+.*)?".format(name_group), re.IGNORECASE
    )

    # Walk entities back-to-front so earlier character offsets stay valid.
    for ent in doc.nlp.ents[::-1]:
        if not doc.can_redact(ent.start_char, ent.end_char):
            continue
        match = pattern.match(ent.text)
        if match is None:
            continue
        prefix = match.group(1) or ""
        suffix = match.group(2) or ""
        yield doc.redact(
            ent.start_char,
            ent.end_char,
            "{}[{}]{}".format(prefix, placeholder, suffix),
            info=info,
        )

108 

109 

def _redact_words(
    doc: SourceText, literals: Iterable[str], placeholder: str, info: str = ""
) -> Generator[Redaction, None, None]:
    """Redact words as tokenized by NLP.

    NOTE: the match against ``literals`` is case-sensitive, unlike the
    entity-based redaction.

    :param doc: Source text
    :param literals: List of literal strings to match
    :param placeholder: String to use in lieu of matching words
    :param info: Comment to pass to redaction for tracing
    :yields: Redactions
    """
    targets = set(literals)
    masked = "[{}]".format(placeholder)

    # Iterate tokens in reverse so earlier character offsets stay valid.
    for token in reversed(list(doc.nlp)):
        begin = token.idx
        finish = begin + len(token)
        if token.text in targets and doc.can_redact(begin, finish):
            yield doc.redact(begin, finish, masked, info=info)

131 

132 

def mask_skin_color(
    doc: SourceText, placeholder: str = "race/ethnicity"
) -> Generator[Redaction, None, None]:
    """Generate redactions for words used to describe skin color.

    E.g., "black person" -> "[race/ethnicity] person"

    NOTE: There may be overlap here with rules that deal with ethnicity
    directly.

    :param doc: Source text
    :param placeholder: String to use in lieu of skin color words.
    :yields: Redactions
    """
    skin_color_re = re.compile(
        _re_literal_noun_phrase(SKIN_COLORS | RACE_WORDS, PERSON_REF),
        re.IGNORECASE,
    )

    for m in skin_color_re.finditer(doc.text):
        begin, finish = m.span()
        # Keep the person-noun, replace only the color/race adjectives.
        yield doc.redact(
            begin,
            finish,
            "[{}] {}".format(placeholder, m.group("noun")),
            info="skin color",
        )

154 

155 

def mask_hair_color(
    doc: SourceText, placeholder: str = "color"
) -> Generator[Redaction, None, None]:
    """Generate redactions for hair color.

    E.g., "red hair" -> "[color] hair"

    :param doc: Source text
    :param placeholder: String to use in lieu of color word
    :yields: Redactions
    """
    hair_color_re = re.compile(
        _re_literal_noun_phrase(GENERAL_COLORS | HAIR_COLORS, HAIR_REF),
        re.IGNORECASE,
    )

    for m in hair_color_re.finditer(doc.text):
        begin, finish = m.span()
        # Keep the hair-noun, replace only the color adjectives.
        yield doc.redact(
            begin,
            finish,
            "[{}] {}".format(placeholder, m.group("noun")),
            info="hair color",
        )

175 

176 

def mask_hair_style(
    doc: SourceText, placeholder: str = "hairstyle"
) -> Generator[Redaction, None, None]:
    """Generate redactions for hair styles.

    E.g., "black short afro hair" -> "[hairstyle] hair"

    :param doc: Source text
    :param placeholder: String to use in lieu of hair style
    :yields: Redactions
    """
    style_adjs = SENSITIVE_HAIR_REF | HAIR_ADJS | GENERAL_COLORS | HAIR_COLORS
    style_nouns = SENSITIVE_HAIR_REF | HAIR_REF
    masked = "[{}] hair".format(placeholder)

    # Two passes: full "<adjs> <noun>" phrases, then bare sensitive terms.
    patterns = (
        _re_literal_noun_phrase(style_adjs, style_nouns),
        re_literal_group(SENSITIVE_HAIR_REF),
    )
    for raw_pattern in patterns:
        compiled = re.compile(raw_pattern, re.IGNORECASE)
        for m in compiled.finditer(doc.text):
            begin, finish = m.span()
            yield doc.redact(begin, finish, masked, info="hair style")

200 

201 

def mask_eye_color(
    doc: SourceText, placeholder: str = "color"
) -> Generator[Redaction, None, None]:
    """Generate redactions for eye color.

    E.g., "blue eyes" -> "[color] eyes"

    :param doc: Source text
    :param placeholder: String to use in lieu of color word
    :yields: Redactions
    """
    eye_color_re = re.compile(
        _re_literal_noun_phrase(GENERAL_COLORS | EYE_COLORS, EYE_REF),
        re.IGNORECASE,
    )

    for m in eye_color_re.finditer(doc.text):
        begin, finish = m.span()
        # Keep the eye-noun, replace only the color adjectives.
        yield doc.redact(
            begin,
            finish,
            "[{}] {}".format(placeholder, m.group("noun")),
            info="eye color",
        )

221 

222 

def mask_country(
    doc: SourceText, placeholder: str = "country"
) -> Generator[Redaction, None, None]:
    """Generate redactions for country names.

    E.g., "Burundi" -> "[country]"

    :param doc: Source text
    :param placeholder: String to use in lieu of country name
    :yields: Redactions
    """
    # Entity pass first, then a token-level pass for anything NER missed.
    for redactor in (_redact_entities, _redact_words):
        yield from redactor(doc, COUNTRIES, placeholder, info="country")

236 

237 

def mask_language(
    doc: SourceText, placeholder: str = "language"
) -> Generator[Redaction, None, None]:
    """Generate redactions for language names.

    E.g., "Spanish" -> "[language]"

    :param doc: Source text
    :param placeholder: String to use in lieu of language
    :yields: Redactions
    """
    # Entity pass first, then a token-level pass for anything NER missed.
    for redactor in (_redact_entities, _redact_words):
        yield from redactor(doc, LANGUAGES, placeholder, info="language")

251 

252 

def mask_nationality(
    doc: SourceText, placeholder: str = "nationality/ethnicity"
) -> Generator[Redaction, None, None]:
    """Generate redactions for nationalities.

    E.g., "Mexican" -> "[nationality/ethnicity]"

    :param doc: Source text
    :param placeholder: String to use in lieu of nationality
    :yields: Redactions
    """
    # NOTE(acw): Tried using spacy's NER classifier alone here, but it would
    # too often classify irrelevant words (e.g., "5/18/2019" or "Silver Honda")
    # as languages or locations.
    for redactor in (_redact_entities, _redact_words):
        yield from redactor(doc, NATIONALITIES, placeholder, info="nationality")

269 

270 

def mask_race(
    doc: SourceText, placeholder: str = "race/ethnicity"
) -> Generator[Redaction, None, None]:
    """Generate redactions for words that directly indicate race.

    E.g., "African American" -> "[race/ethnicity]"

    :param doc: Source text
    :param placeholder: String to use in lieu of race/ethnicity
    :yields: Redactions
    """
    race_re = re.compile(_re_literal_adj_list(RACE_WORDS | SLURS), re.IGNORECASE)
    masked = "[{}]".format(placeholder)

    for m in race_re.finditer(doc.text):
        begin, finish = m.span()
        yield doc.redact(begin, finish, masked, info="race")

289 

290 

def mask_race_correlated_feature(
    doc: SourceText, placeholder: str = "physical description"
) -> Generator[Redaction, None, None]:
    """Generate redactions for features that are highly correlated with race
    without context.

    E.g., "We saw a blonde" -> "We saw a [physical description]"

    :param doc: Source text
    :param placeholder: String to use in lieu of race-correlated features
    :yields: Redactions
    """
    feature_re = re.compile(
        r"\b{}\b".format(re_literal_group(RACE_FEATURES)),
        re.IGNORECASE,
    )
    masked = "[{}]".format(placeholder)

    for m in feature_re.finditer(doc.text):
        begin, finish = m.span()
        yield doc.redact(begin, finish, masked, info="race")

311 

312 

def mask_race_abbrev(
    doc: SourceText, placeholder: str = "race/ethnicity"
) -> Generator[Redaction, None, None]:
    """Generate redactions for abbreviated words that directly indicate race.

    E.g., "AMA" -> "[race/ethnicity] male adult"

    :param doc: Source text
    :param placeholder: String to use in lieu of race/ethnicity
    :yields: Redactions
    """
    # Deliberately case-sensitive: these abbreviations are only meaningful
    # in their exact (upper-case) form.
    abbrev_re = re.compile(r"(?<=\b){}s?(?=\b)".format(RACE_ABBREV))

    sex_by_code = {"F": "female", "M": "male"}
    age_by_code = {"A": "adult", "J": "juvenile"}

    for m in abbrev_re.finditer(doc.text):
        begin, finish = m.span()
        # Groups 2 and 3 appear to be the sex and age code captured by
        # RACE_ABBREV — confirm against mask_const if the pattern changes.
        expansion = "[{}] {} {}".format(
            placeholder, sex_by_code.get(m.group(2)), age_by_code.get(m.group(3))
        )
        yield doc.redact(begin, finish, expansion, info="race")

338 

339 

def mask_appearance_list(
    doc: SourceText, placeholder: str = "color"
) -> Generator[Redaction, None, None]:
    """Generate redactions for "<feature>: <value>" appearance lists.

    E.g., "Race: Hispanic" -> "Race: [race/ethnicity]"
    E.g., "Hair: Black" -> "Hair: [color]"

    :param doc: Source text
    :param placeholder: String to use in lieu of the feature value (race and
        complexion entries always use "race/ethnicity" regardless)
    :yields: Redactions
    """
    color_group = _re_literal_adj_list(
        SKIN_COLORS | HAIR_COLORS | HAIR_ADJS | EYE_COLORS | GENERAL_COLORS
    )
    appearance_group = re_literal_group(APPEARANCE_LIST, name="noun")
    appearance_list_re = re.compile(
        r"{}:\s*{}".format(appearance_group, color_group), re.IGNORECASE
    )

    for match in appearance_list_re.finditer(doc.text):
        noun = match.group("noun")
        # BUG FIX: use a per-match local instead of reassigning the
        # `placeholder` parameter. Previously, once a "Race:"/"Complexion:"
        # entry matched, every later hair/eye match in the same document was
        # mislabeled "[race/ethnicity]" instead of "[color]".
        fill = placeholder
        if noun.lower() in ("race", "complexion"):
            fill = "race/ethnicity"
            info = "race"
        elif noun == "eyes":
            info = "eye color"
        elif noun == "hair":
            info = "hair color"
        else:
            info = "appearance list"

        start, end = match.span()
        replacement = "{}: [{}]".format(noun, fill)
        yield doc.redact(start, end, replacement, info=info)

373 

374 

def mask_street_address(
    doc: SourceText, placeholder: str = "location"
) -> Generator[Redaction, None, None]:
    """Generate redactions for street addresses.

    E.g., "123 Maple St." -> "[location] St."

    :param doc: Source text
    :param placeholder: Text to use in lieu of literal street address
    :yields: Redactions
    """
    endings_group = re_literal_group(USPS_STREET_ABBR)
    addr_re = re.compile(
        r"(?:\d{1,5} [\w\s]{1,20}) (" + endings_group + r"\.?)\W?(?=\s|$)",
        re.IGNORECASE,
    )

    # Reject lookalikes that aren't street locations,
    # e.g. "30 mph" or "#2 lane".
    reject_re = re.compile(
        r"\d{1,3}\s?mph\b|\b#?\d\s?([nesw]/?b\s?)?lane\b",  # speed | lane in road
        re.IGNORECASE,
    )

    for m in addr_re.finditer(doc.text):
        if reject_re.search(m.group(0)):
            continue

        begin, finish = m.span()
        # Keep the street-type suffix (e.g. "St.") visible.
        yield doc.redact(
            begin,
            finish,
            "[{}] {}".format(placeholder, m.group(1)),
            info="street address",
        )

407 

408 

def mask_district(
    doc: SourceText, locale: Locale, placeholder: str = "district"
) -> Generator[Redaction, None, None]:
    """Generate redactions for police precincts.

    :param doc: Source text
    :param locale: Locale to use for masking
    :param placeholder: Text to use in lieu of literal district name
    :yields: Redactions
    """
    for m in locale.match_district(doc.text):
        begin, finish = m.span()
        suffix = (m.group(2) or "").lower()
        replacement = "[{}]".format(placeholder)
        # Only keep the suffix when it wouldn't be awkwardly redundant,
        # as in "[district] district".
        if suffix and suffix != placeholder:
            replacement = "{} {}".format(replacement, suffix)
        yield doc.redact(begin, finish, replacement, info="district name")

429 

430 

def mask_presumed_street_name(
    doc: SourceText, placeholder: str = "street"
) -> Generator[Redaction, None, None]:
    """Generate redactions for entities that look like street names.

    E.g., "Maple St." -> "[street] St."

    :param doc: Source text
    :param placeholder: Text to use in lieu of street name
    :yields: Redactions
    """
    # Flatten the case variants with a single comprehension; the previous
    # sum(list_of_lists, list()) idiom is quadratic in the number of
    # abbreviations. Variant order per abbreviation is preserved.
    ending_variants = [
        variant
        for abbr in USPS_STREET_ABBR
        for variant in (abbr, abbr.capitalize(), abbr.upper())
    ]
    street_endings = re_literal_group(ending_variants, capture=False)
    street_name_pattern = (
        r"(?:(?:\d+|[A-Z])[A-Za-z\']*\s+)+"
        + r"(%s\.?)" % street_endings
        + r"(?=[,\/#!$%\^&\*;:{}=\-_`~()\s])"
    )
    # Last pattern matches any `\b` except `\.` (matched in second pattern)
    # This keeps the period (e.g. in "St.") in the placeholder
    # NOTE(jnu): this is not case insensitive; the point is to use the
    # capitalization structure to infer words that might constitute a street
    # name.
    street_name_re = re.compile(street_name_pattern)

    # Avoid matching false street names:
    # e.g. EB lane, E/B lane, #2 lane (on the freeway)
    bad_patterns_re = re.compile(r"\b(#?\d\s)?([nesw]/?b\s?)?lane\b", re.IGNORECASE)

    for match in street_name_re.finditer(doc.text):
        if bad_patterns_re.search(match.group(0)):
            continue

        start, end = match.span()
        # Keep the street-type suffix (e.g. "St.") visible.
        replacement = "[{}] {}".format(placeholder, match.group(1))
        yield doc.redact(start, end, replacement, info="presumed street name")

471 

472 

def mask_known_street_name(
    doc: SourceText, locale: Locale, placeholder: str = "street"
) -> Generator[Redaction, None, None]:
    """Generate redactions for known streets in the city.

    E.g., "Arguello and Euclid" -> "[street] and [street]"

    :param doc: Source text
    :param locale: Locale to use for masking
    :param placeholder: Text to use in lieu of street name
    :yields: Redactions
    """
    for m in locale.match_street_name(doc.text):
        begin, finish = m.span()
        # Preserve the conjunction between the two street names.
        masked = "[{ph}]{conj}[{ph}]".format(ph=placeholder, conj=m.group("conj"))
        yield doc.redact(begin, finish, masked, info="known street name")

491 

492 

def mask_neighborhood(
    doc: SourceText, locale: Locale, placeholder: str = "neighborhood"
) -> Generator[Redaction, None, None]:
    """Generate redactions for neighborhoods in the city.

    E.g., "Parkside" -> "[neighborhood]"

    :param doc: Source text
    :param locale: Locale to use to perform masking
    :param placeholder: Text to use in lieu of neighborhood name
    :yields: Redactions
    """
    # TODO(jnu): improve Locale API for matching these
    yield from _redact_entities(
        doc,
        locale.neighborhoods,
        placeholder,
        info="neighborhood",
    )

509 

510 

def _create_person_name_map(persons: Iterable[AnyPerson]) -> Dict[str, Set[AnyPerson]]:
    """Create a map from surface name representations to persons.

    The map connects the surface representations of a human name (such as
    "John P. Smith") to the PersonName instances that this name could refer
    to. In most cases this should be unique, however there may be ambiguous
    cases such as "J. Smith" that might refer to multiple individuals.

    :param persons: List of person references
    :returns: Map from names to person references
    """
    mapping = DefaultDict[str, Set[AnyPerson]](set)

    for person in persons:
        for surface in person.name_rep():
            mapping[surface].add(person)

    return dict(mapping)

529 

530 

def mask_person(
    doc: SourceText,
    persons: Iterable[AnyPerson],
    info: str,
) -> Generator[Redaction, None, None]:
    """Generate a list of redactions for the persons given in the input.

    :param doc: Source text
    :param persons: List of person references to redact
    :param info: Kind of person being redacted — "officer" or "person".
        Controls the format of the replacement codename.
    :yields: Redaction instances
    :raises ValueError: If ``info`` is not a recognized person kind.
    """
    person_signs = _create_person_name_map(persons)

    # Process surface representations of names in order of longest to shortest.
    # This means the longest names will be replaced first, which should help to
    # avoid ambiguity.
    sorted_signs = sorted(person_signs.items(), key=lambda x: len(x[0]), reverse=True)

    for signifier, signified in sorted_signs:
        # NOTE(review): the surface name is compiled without re.escape, so
        # regex metacharacters in a name (e.g. "." in "J. Smith") are treated
        # as regex syntax — confirm this is intended by name_rep().
        pattern = re.compile(signifier, re.IGNORECASE)
        # Ambiguous references are listed in indicator order.
        ordered_signified = sorted(signified, key=lambda a: a.get_indicator())
        if info == "officer":
            # replacement as "Officer #1 or Officer #2"
            codename = " or ".join([p.get_indicator() for p in ordered_signified])
        elif info == "person":
            # replacement as "(PERSON_1 or PERSON_2)" rather than
            # "(PERSON_1) or (PERSON_2)"
            codename = "(%s)" % " or ".join(
                [re.sub(r"[\(\)]", "", p.get_indicator()) for p in ordered_signified]
            )
        else:
            # BUG FIX: an unrecognized kind previously left `codename`
            # unbound, producing an UnboundLocalError at the first match.
            # Fail fast with a clear error instead.
            raise ValueError("unknown person kind: {!r}".format(info))

        for match in pattern.finditer(doc.text):
            replacement = codename
            start, end = match.span()
            # Special case: the rare terminal-apostrophe possessive, such as
            # "Moses'" where the correct redaction synthetically adds the 's.
            # TODO(jnu): probably better to handle this where we handle the
            # indefinite article redaction, in SourceText.
            if doc.text[end : end + 2] == "' ":
                replacement = codename + "'s"
                end += 1

            # (Removed a no-op `ordered_signified[0]` statement left over
            # from an unfinished coloring/classing experiment.)
            yield doc.redact(
                start,
                end,
                replacement,
                auto_capitalize=False,
                autocorrect_article=False,
                info=info,
            )

585 

586 

def mask_person_fuzzy(
    doc: SourceText,
    persons: Iterable[PersonName],
    info: str,
) -> Generator[Redaction, None, None]:
    """Generate redactions for proper nouns in the text that are similar to
    the names of the given persons.

    :param doc: Source text
    :param persons: List of person references to redact
    :param info: Comment to pass to redaction for tracing
    :yields: Redaction instances
    """
    min_character_limit = 5

    # Only consider proper-noun tokens longer than the character limit.
    candidate_tokens = {
        tok
        for tok in doc.nlp
        if tok.pos_ == "PROPN" and len(tok) > min_character_limit
    }

    for tok in candidate_tokens:
        begin = tok.idx
        finish = begin + len(tok)

        if not doc.can_redact(begin, finish):
            continue

        surface = tok.text.upper()
        matched_persons = [
            person
            for person in persons
            if _name_match({f"{person.first} {person.last}"}, {surface})
            or _name_match(person.last, {surface}, 1)
            or _name_match(person.first, {surface}, 1)
        ]
        if not matched_persons:
            continue

        # Ambiguous matches read "(PERSON_1 or PERSON_2)".
        replacement = "(%s)" % " or ".join(
            [re.sub(r"[\(\)]", "", person.get_indicator()) for person in matched_persons]
        )
        yield doc.redact(
            begin,
            finish,
            replacement,
            auto_capitalize=False,
            autocorrect_article=False,
            info=info,
        )

640 

641 

def mask(
    locale: Locale,
    narrative: str,
    persons: Iterable[PersonName],
    officers: Iterable[OfficerName],
) -> List[Redaction]:
    """Apply masking and formatting to narrative text.

    :param locale: Locale to use for masking
    :param narrative: Incident report text
    :param persons: List of names of people appearing in text
    :param officers: List of names of officers appearing in text
    :returns: List of redactions
    """
    doc = SourceText(narrative)

    # Order matters: person redactions run first, then locations, then
    # appearance/race features, then the fuzzy person pass as a catch-all.
    redaction_passes = [
        mask_person(doc, officers, "officer"),
        mask_person(doc, persons, "person"),
        mask_street_address(doc),
        mask_district(doc, locale),
        mask_known_street_name(doc, locale),
        mask_presumed_street_name(doc),
        mask_neighborhood(doc, locale),
        mask_skin_color(doc),
        mask_hair_style(doc),
        mask_hair_color(doc),
        mask_eye_color(doc),
        mask_appearance_list(doc),
        mask_race_abbrev(doc),
        mask_race(doc),
        mask_race_correlated_feature(doc),
        mask_country(doc),
        mask_language(doc),
        mask_nationality(doc),
        mask_person_fuzzy(doc, persons, "person"),
    ]
    return list(itertools.chain.from_iterable(redaction_passes))

680 

681 

def merge_annotations(annotations, narrative: str) -> List[Redaction]:
    """Merge adjacent 'person' annotations that contain the same text and
    info when they are separated by at most a single whitespace character.

    e.g. "(S1) (S1)" -> "(S1)"

    :param annotations: unsorted list of annotations
    :param narrative: Incident report text
    :returns: reverse sorted list of merged annotations
    """
    if not annotations or len(annotations) <= 1:
        return annotations

    # Order redactions by start offset, last to first, so mergeable
    # neighbors end up adjacent in the list.
    annotations.sort(key=lambda a: a.start, reverse=True)

    merged = list[Redaction]()
    current = annotations[0]

    for candidate in annotations[1:]:
        mergeable = (
            current.start - candidate.end <= 1
            and candidate.text == current.text
            and candidate.info == current.info
            and current.info == "person"
            and re.match(r"\s", narrative[candidate.end : current.start])
        )
        if mergeable:
            # Absorb the earlier annotation by extending the current one
            # backwards over the gap.
            current.start = candidate.start
        else:
            merged.append(current)
            current = candidate
    merged.append(current)

    return merged

715 

716 

def annotate(
    locale: Locale,
    narrative: str,
    persons: Iterable[dict],
    officers: Iterable[dict],
    redact_officers_from_text=True,
) -> List[Redaction]:
    """Apply redaction tool and formatting to narrative text.

    :param locale: location of narrative
    :param narrative: Incident report text
    :param persons: List of people appearing in text
    :param officers: List of officers appearing in text
    :param redact_officers_from_text: Whether to also redact officer names
        discovered in the narrative itself
    :returns: redaction annotations
    """
    person_types = set(locale.indicators.keys())

    filtered_persons = locale.filter_names(persons)
    known_persons = [PersonName(**person) for person in filtered_persons]
    known_officers = [OfficerName(**officer) for officer in officers]

    # get_persons_from_narrative only applicable to sf right now, will
    # refactor later
    known_persons += get_persons_from_narrative(narrative, 0, person_types)
    if redact_officers_from_text:
        known_officers += get_officers_from_narrative(narrative)

    known_persons = PersonName.dedupe(known_persons, locale)
    known_officers = OfficerName.dedupe(known_officers, locale)

    # Create redactions, then collapse duplicate adjacent person mentions.
    annotations = mask(
        locale, narrative, persons=known_persons, officers=known_officers
    )
    return merge_annotations(annotations, narrative)