Coverage for blind_charging/masker.py: 90%

245 statements  

« prev     ^ index     » next       coverage.py v6.5.0, created at 2025-10-20 15:43 +0000

1import itertools 

2import re 

3from typing import DefaultDict, Dict, Generator, Iterable, List, Set, Union 

4 

5from .annotation import Redaction 

6from .locale import Locale 

7from .locale.const import USPS_STREET_ABBR 

8from .mask_const import ( 

9 APPEARANCE_LIST, 

10 COUNTRIES, 

11 EYE_COLORS, 

12 EYE_REF, 

13 GENERAL_COLORS, 

14 HAIR_ADJS, 

15 HAIR_COLORS, 

16 HAIR_REF, 

17 LANGUAGES, 

18 NATIONALITIES, 

19 PERSON_REF, 

20 RACE_ABBREV, 

21 RACE_FEATURES, 

22 RACE_WORDS, 

23 SENSITIVE_HAIR_REF, 

24 SKIN_COLORS, 

25) 

26from .officer import OfficerName 

27from .person import PersonName, _name_match 

28from .re_util import re_literal_group 

29from .source_text import SourceText 

30from .text_processing import get_officers_from_narrative, get_persons_from_narrative 

31 

32AnyPerson = Union[OfficerName, PersonName] 

33 

34 

35# TODO(jnu): rewrite to generalize common behaviors. Really we only have three 

36# approaches: using PersonNames, using RegEx, and using NER. Generalize these 

37# as first-class rules that can be parameterized and applied. 

38 

39 

def _re_literal_adj_list(adjectives: Iterable[str]) -> str:
    """
    Create a RegExp pattern for matching a list of adjectives with literals.

    Handles comma-separated runs of adjectives, optionally joined by a
    conjunction ("and"/"or" or "&"/"/") with an optional determiner.

    :param adjectives: List of adjective literals
    :returns: RegExp pattern
    """
    parts = {
        "adj": re_literal_group(adjectives),
        "cnj": re_literal_group(["and", "or"], capture=False),
        "cnj_sym": re_literal_group(["&", "/"], capture=False),
        "det": re_literal_group(["a", "an", "the", "some", "any"], capture=False),
    }

    template = (
        # fmt: off
        r"\b{adj}(?:\s+,?\s*{adj},?)*"
        r"(?:(?:\s+{cnj}\s+|\s*{cnj_sym}\s*)(?:{det}\s+)?{adj}(?:\s+,?\s*{adj},?)*)?\b"
        # fmt: on
    )
    return template.format(**parts)

59 

60 

def _re_literal_noun_phrase(adjectives: Iterable[str], nouns: Iterable[str]) -> str:
    """Create a RegExp pattern for matching a simple noun phrase with literals.

    Example:
        pattern = f(["green", "black"], ["frog", "toad"])

    This pattern would match "green frog" and "black toad" and even
    "green and black toad."

    :param adjectives: List of adjective literals
    :param nouns: List of noun literals
    :returns: RegExp pattern (the noun is captured in a group named "noun")
    """
    return r"{adj}\s+{n}\b".format(
        adj=_re_literal_adj_list(adjectives),
        n=re_literal_group(nouns, name="noun"),
    )

78 

79 

def _redact_entities(
    doc: SourceText, literals: Iterable[str], placeholder: str, info: str = ""
) -> Generator[Redaction, None, None]:
    """Redact NLP entities matching the given list.

    :param doc: Source text
    :param literals: List of literal strings to match
    :param placeholder: String to use in lieu of matched entities
    :param info: Comment to pass to redaction for tracing
    :yields: Redactions
    """
    name_group = re_literal_group(literals, capture=False)
    # The lazy prefix group lets the longest literal in the group win.
    entity_re = re.compile(
        r"(.*?\s+)??\b{}\b(\s+.*)?".format(name_group), re.IGNORECASE
    )

    # Walk the entities back-to-front so earlier offsets stay valid as we go.
    for ent in doc.nlp.ents[::-1]:
        if not doc.can_redact(ent.start_char, ent.end_char):
            continue
        match = entity_re.match(ent.text)
        if not match:
            continue
        prefix = match.group(1) or ""
        suffix = match.group(2) or ""
        yield doc.redact(
            ent.start_char,
            ent.end_char,
            "{}[{}]{}".format(prefix, placeholder, suffix),
            info=info,
        )

107 

108 

def _redact_words(
    doc: SourceText, literals: Iterable[str], placeholder: str, info: str = ""
) -> Generator[Redaction, None, None]:
    """Redact words as tokenized by NLP.

    :param doc: Source text
    :param literals: List of literal strings to match (case-sensitive)
    :param placeholder: String to use in lieu of matching words
    :param info: Comment to pass to redaction for tracing
    :yields: Redaction
    """
    targets = set(literals)
    substitute = "[{}]".format(placeholder)

    # Iterate tokens back-to-front so earlier offsets stay valid.
    for token in list(doc.nlp)[::-1]:
        begin = token.idx
        finish = begin + len(token)
        if doc.can_redact(begin, finish) and token.text in targets:
            yield doc.redact(begin, finish, substitute, info=info)

130 

131 

def mask_skin_color(
    doc: SourceText, placeholder: str = "race/ethnicity"
) -> Generator[Redaction, None, None]:
    """Generate redactions for words used to describe skin color.

    E.g., "black person" -> "[race/ethnicity] person"

    NOTE: There may be overlap here with rules that deal with ethnicity
    directly.

    :param doc: Source text
    :param placeholder: String to use in lieu of skin color words.
    :yields: Redactions
    """
    skin_color_re = re.compile(
        _re_literal_noun_phrase(SKIN_COLORS | RACE_WORDS, PERSON_REF),
        re.IGNORECASE,
    )

    for match in skin_color_re.finditer(doc.text):
        begin, finish = match.span()
        yield doc.redact(
            begin,
            finish,
            "[{}] {}".format(placeholder, match.group("noun")),
            info="skin color",
        )

153 

154 

def mask_hair_color(
    doc: SourceText, placeholder: str = "color"
) -> Generator[Redaction, None, None]:
    """Generate redactions for hair color.

    E.g., "red hair" -> "[color] hair"

    :param doc: Source text
    :param placeholder: String to use in lieu of color word
    :yields: Redactions
    """
    hair_color_re = re.compile(
        _re_literal_noun_phrase(GENERAL_COLORS | HAIR_COLORS, HAIR_REF),
        re.IGNORECASE,
    )

    for match in hair_color_re.finditer(doc.text):
        begin, finish = match.span()
        yield doc.redact(
            begin,
            finish,
            "[{}] {}".format(placeholder, match.group("noun")),
            info="hair color",
        )

174 

175 

def mask_hair_style(
    doc: SourceText, placeholder: str = "hairstyle"
) -> Generator[Redaction, None, None]:
    """Generate redactions for hair styles.

    E.g., "black short afro hair" -> "[hairstyle] hair"

    :param doc: Source text
    :param placeholder: String to use in lieu of hair style
    :yields: Redaction
    """
    adjs = SENSITIVE_HAIR_REF | HAIR_ADJS | GENERAL_COLORS | HAIR_COLORS
    nouns = SENSITIVE_HAIR_REF | HAIR_REF
    substitute = "[{}] hair".format(placeholder)

    # Two passes: full noun phrases first, then bare sensitive hair terms.
    patterns = (
        _re_literal_noun_phrase(adjs, nouns),
        re_literal_group(SENSITIVE_HAIR_REF),
    )
    for raw_pattern in patterns:
        hairstyle_re = re.compile(raw_pattern, re.IGNORECASE)
        for match in hairstyle_re.finditer(doc.text):
            begin, finish = match.span()
            yield doc.redact(begin, finish, substitute, info="hair style")

199 

200 

def mask_eye_color(
    doc: SourceText, placeholder: str = "color"
) -> Generator[Redaction, None, None]:
    """Generate redactions for eye color.

    E.g., "blue eyes" -> "[color] eyes"

    :param doc: Source text
    :param placeholder: String to use in lieu of color word
    :yields: Redactions
    """
    eye_color_re = re.compile(
        _re_literal_noun_phrase(GENERAL_COLORS | EYE_COLORS, EYE_REF),
        re.IGNORECASE,
    )

    for match in eye_color_re.finditer(doc.text):
        begin, finish = match.span()
        yield doc.redact(
            begin,
            finish,
            "[{}] {}".format(placeholder, match.group("noun")),
            info="eye color",
        )

220 

221 

def mask_country(
    doc: SourceText, placeholder: str = "country"
) -> Generator[Redaction, None, None]:
    """Generate redactions for country names.

    E.g., "Burundi" -> "[country]"

    :param doc: Source text
    :param placeholder: String to use in lieu of country name
    :yields: Redactions
    """
    # Sweep NER entities first, then catch remaining bare tokens.
    for redaction in _redact_entities(doc, COUNTRIES, placeholder, info="country"):
        yield redaction
    for redaction in _redact_words(doc, COUNTRIES, placeholder, info="country"):
        yield redaction

235 

236 

def mask_language(
    doc: SourceText, placeholder: str = "language"
) -> Generator[Redaction, None, None]:
    """Generate redactions for language names.

    E.g., "Spanish" -> "[language]"

    (The original docstring said "nationalities" — a copy-paste error from
    the neighboring ``mask_nationality``; this rule redacts LANGUAGES.)

    :param doc: Source text
    :param placeholder: String to use in lieu of language
    :yields: Redactions
    """
    # Sweep NER entities first, then catch remaining bare tokens.
    yield from _redact_entities(doc, LANGUAGES, placeholder, info="language")
    yield from _redact_words(doc, LANGUAGES, placeholder, info="language")

250 

251 

def mask_nationality(
    doc: SourceText, placeholder: str = "nationality/ethnicity"
) -> Generator[Redaction, None, None]:
    """Generate redactions for nationalities.

    E.g., "Mexican" -> "[nationality/ethnicity]"

    :param doc: Source text
    :param placeholder: String to use in lieu of nationality
    :yields: Redactions
    """
    # NOTE(acw): Tried using spacy's NER classifier alone here, but it would
    # too often classify irrelevant words (e.g., "5/18/2019" or "Silver Honda")
    # as languages or locations.
    for redaction in _redact_entities(
        doc, NATIONALITIES, placeholder, info="nationality"
    ):
        yield redaction
    for redaction in _redact_words(doc, NATIONALITIES, placeholder, info="nationality"):
        yield redaction

268 

269 

def mask_race(
    doc: SourceText, placeholder: str = "race/ethnicity"
) -> Generator[Redaction, None, None]:
    """Generate redactions for words that directly indicate race.

    E.g., "African American" -> "[race/ethnicity]"

    :param doc: Source text
    :param placeholder: String to use in lieu of race/ethnicity
    :yields: Redactions
    """
    race_re = re.compile(_re_literal_adj_list(RACE_WORDS), re.IGNORECASE)
    substitute = "[{}]".format(placeholder)

    for match in race_re.finditer(doc.text):
        begin, finish = match.span()
        yield doc.redact(begin, finish, substitute, info="race")

288 

289 

def mask_other_literals(
    doc: SourceText,
    literals: dict[str, list[str]] | None,
) -> Generator[Redaction, None, None]:
    """Generate redactions based on custom lists of literal words.

    Example:
        literals = {
            "district": ["lake district", "park district"],
        }

        "The suspect was last seen the Park District" ->
        "The suspect was last seen in the [district]"

    :param doc: Source text
    :param literals: Dictionary describing literal words to redact. Keys will
        be used to substitute for each of the values in the associated list.
    :yields: Redactions
    """
    # Treat a missing dictionary as "no custom redactions".
    for label, words in (literals or {}).items():
        word_re = re.compile(re_literal_group(words), re.IGNORECASE)
        substitute = "[{}]".format(label)

        for match in word_re.finditer(doc.text):
            begin, finish = match.span()
            yield doc.redact(begin, finish, substitute, info=label)

320 

321 

def mask_race_correlated_feature(
    doc: SourceText, placeholder: str = "physical description"
) -> Generator[Redaction, None, None]:
    """Generate redactions for feature that are highly correlated with race
    without context.

    E.g., "We saw a blonde" -> "We saw a [physical description]"

    :param doc: Source text
    :param placeholder: String to use in lieu of race-correlated features
    :yields: Redactions
    """
    feature_re = re.compile(
        r"\b{}\b".format(re_literal_group(RACE_FEATURES)), re.IGNORECASE
    )
    substitute = "[{}]".format(placeholder)

    for match in feature_re.finditer(doc.text):
        begin, finish = match.span()
        yield doc.redact(begin, finish, substitute, info="race")

342 

343 

def mask_race_abbrev(
    doc: SourceText, placeholder: str = "race/ethnicity"
) -> Generator[Redaction, None, None]:
    """Generate redactions for abbreviated words that directly indicate race.

    E.g., "AMA" -> "[race/ethnicity] male adult"

    :param doc: Source text
    :param placeholder: String to use in lieu of race/ethnicity
    :yields: Redactions
    """
    # Case-sensitive on purpose: only the uppercase abbreviation form counts.
    abbrev_re = re.compile(r"(?<=\b){}s?(?=\b)".format(RACE_ABBREV))

    sex_by_code = {"F": "female", "M": "male"}
    age_by_code = {"A": "adult", "J": "juvenile"}

    for match in abbrev_re.finditer(doc.text):
        begin, finish = match.span()
        # Groups 2 and 3 of RACE_ABBREV encode sex and age respectively
        # (presumably single letters F/M and A/J — defined in mask_const).
        substitute = "[{}] {} {}".format(
            placeholder,
            sex_by_code.get(match.group(2)),
            age_by_code.get(match.group(3)),
        )
        yield doc.redact(begin, finish, substitute, info="race")

369 

370 

def mask_appearance_list(
    doc: SourceText, placeholder: str = "color"
) -> Generator[Redaction, None, None]:
    """Generate redactions for words in list format that directly indicate race.

    E.g., "Race: Hispanic" -> "Race: [race/ethnicity]"
    E.g., "Hair: Black" -> "Hair: [color]"

    :param doc: Source text
    :param placeholder: String to use in lieu of feature
    :yields: Redactions
    """
    color_group = _re_literal_adj_list(
        SKIN_COLORS | HAIR_COLORS | HAIR_ADJS | EYE_COLORS | GENERAL_COLORS
    )
    appearance_group = re_literal_group(APPEARANCE_LIST, name="noun")
    pattern = r"{}:\s*{}".format(appearance_group, color_group)
    appearance_list_re = re.compile(pattern, re.IGNORECASE)

    for match in appearance_list_re.finditer(doc.text):
        # Compare lowercased throughout: the regex is IGNORECASE, so the
        # noun may arrive as "Eyes"/"HAIR"/etc. The original only lowercased
        # the race/complexion check, mislabeling e.g. "Eyes: blue" as a
        # generic "appearance list" entry.
        noun = match.group("noun").lower()
        # Use a per-match placeholder. The original reassigned the shared
        # `placeholder` parameter when it saw "Race:", which leaked
        # "race/ethnicity" into every later hair/eye match in the document.
        substitute_label = placeholder
        if noun in ("race", "complexion"):
            substitute_label = "race/ethnicity"
            info = "race"
        elif noun == "eyes":
            info = "eye color"
        elif noun == "hair":
            info = "hair color"
        else:
            info = "appearance list"

        start, end = match.span()
        replacement = "{}: [{}]".format(match.group("noun"), substitute_label)
        yield doc.redact(start, end, replacement, info=info)

404 

405 

def mask_street_address(
    doc: SourceText, placeholder: str = "location"
) -> Generator[Redaction, None, None]:
    """Generate redactions for street addresses.

    E.g., "123 Maple St." -> "[location] St."

    :param doc: Source text
    :param placeholder: Text to use in lieu of literal street address
    :yields: Redactions
    """
    suffix_group = re_literal_group(USPS_STREET_ABBR)
    street_addr_re = re.compile(
        r"(?:\d{1,5} [\w\s]{1,20}) (" + suffix_group + r"\.?)\W?(?=\s|$)",
        re.IGNORECASE,
    )

    # Reject speed readings ("30 mph") and freeway lane references
    # ("#2 lane", "N/B lane") that look like addresses.
    false_positive_re = re.compile(
        r"\d{1,3}\s?mph\b|\b#?\d\s?([nesw]/?b\s?)?lane\b",  # speed | lane in road
        re.IGNORECASE,
    )

    for match in street_addr_re.finditer(doc.text):
        if false_positive_re.search(match.group(0)):
            continue

        begin, finish = match.span()
        yield doc.redact(
            begin,
            finish,
            "[{}] {}".format(placeholder, match.group(1)),
            info="street address",
        )

438 

439 

def mask_district(
    doc: SourceText, locale: Locale, placeholder: str = "district"
) -> Generator[Redaction, None, None]:
    """Generate redactions for police precincts.

    :param doc: Source text
    :param locale: Locale to use for masking
    :param placeholder: Text to use in lieu of literal district name
    :yields: Redactions
    """
    for match in locale.match_district(doc.text):
        begin, finish = match.span()
        suffix = (match.group(2) or "").lower()
        # Avoid adding suffix if it'd be awkwardly redundant, as in the case
        # of "[district] district"
        if suffix == placeholder:
            suffix = ""
        replacement = "[{}]".format(placeholder)
        if suffix:
            replacement = "{} {}".format(replacement, suffix)
        yield doc.redact(begin, finish, replacement, info="district name")

460 

461 

def mask_presumed_street_name(
    doc: SourceText, placeholder: str = "street"
) -> Generator[Redaction, None, None]:
    """Generate redactions for entities that look like street names.

    E.g., "Maple St." -> "[street] St."

    :param doc: Source text
    :param placeholder: Text to use in lieu of street name
    :yields: Redactions
    """
    # Accept lowercase, Capitalized, and UPPERCASE forms of each USPS suffix.
    suffix_variants = list[str]()
    for abbr in USPS_STREET_ABBR:
        suffix_variants.extend([abbr, abbr.capitalize(), abbr.upper()])
    street_endings = re_literal_group(suffix_variants, capture=False)

    # Last pattern matches any `\b` except `\.` (matched in second pattern)
    # This keeps the period (e.g. in "St.") in the placeholder
    # NOTE(jnu): this is not case insensitive; the point is to use the
    # capitalization structure to infer words that might constitute a street
    # name.
    street_name_re = re.compile(
        r"(?:(?:\d+|[A-Z])[A-Za-z\']*\s+)+"
        + r"(%s\.?)" % street_endings
        + r"(?=[,\/#!$%\^&\*;:{}=\-_`~()\s])"
    )

    # Avoid matching false street names:
    # e.g. EB lane, E/B lane, #2 lane (on the freeway)
    false_positive_re = re.compile(r"\b(#?\d\s)?([nesw]/?b\s?)?lane\b", re.IGNORECASE)

    for match in street_name_re.finditer(doc.text):
        if false_positive_re.search(match.group(0)):
            continue

        begin, finish = match.span()
        yield doc.redact(
            begin,
            finish,
            "[{}] {}".format(placeholder, match.group(1)),
            info="presumed street name",
        )

502 

503 

def mask_known_street_name(
    doc: SourceText, locale: Locale, placeholder: str = "street"
) -> Generator[Redaction, None, None]:
    """Generate redactions for known streets in the city.

    E.g., "Arguello and Euclid" -> "[street] and [street]"

    :param doc: Source text
    :param locale: Locale to use for masking
    :param placeholder: Text to use in lieu of street name
    :yields: Redactions
    """
    tag = "[{}]".format(placeholder)
    for match in locale.match_street_name(doc.text):
        begin, finish = match.span()
        # Preserve the matched conjunction ("and", "/", ...) between the tags.
        yield doc.redact(
            begin,
            finish,
            tag + match.group("conj") + tag,
            info="known street name",
        )

522 

523 

def mask_neighborhood(
    doc: SourceText, locale: Locale, placeholder: str = "neighborhood"
) -> Generator[Redaction, None, None]:
    """Generate redactions for neighborhoods in the city.

    E.g., "Parkside" -> "[neighborhood]"

    :param doc: Source text
    :param locale: Locale to use to perform masking
    :param placeholder: Text to use in lieu of neighborhood name
    :yields: Redactions
    """
    # TODO(jnu): improve Locale API for matching these
    for redaction in _redact_entities(
        doc, locale.neighborhoods, placeholder, info="neighborhood"
    ):
        yield redaction

540 

541 

def _create_person_name_map(persons: Iterable[AnyPerson]) -> Dict[str, Set[AnyPerson]]:
    """Create a map from surface name representations to persons.

    The map connects the surface representations of a human name (such as
    "John P. Smith") to the PersonName instances that this name could refer to.
    In most cases this should be unique, however there may be ambiguous cases
    such as "J. Smith" that might refer to multiple individuals.

    :param persons: List of person references
    :returns: Map from names to person references
    """
    surface_to_persons = DefaultDict[str, Set[AnyPerson]](set)

    for person in persons:
        for surface in person.name_rep():
            surface_to_persons[surface].add(person)

    return dict(surface_to_persons)

560 

561 

def mask_person(
    doc: SourceText,
    persons: Iterable[AnyPerson],
    info: str,
) -> Generator[Redaction, None, None]:
    """Generate a list of redactions for the persons given in the input.

    :param doc: Source text
    :param persons: List of person references to redact
    :param info: Category label: "officer" or "person" (controls the
        replacement format; any other value raises ValueError)
    :yields: Redaction instances
    :raises ValueError: If ``info`` is not "officer" or "person"
    """
    person_signs = _create_person_name_map(persons)

    # Process surface representations of names in order of longest to shortest.
    # This means the longest names will be replaced first, which should help to
    # avoid ambiguity.
    sorted_signs = sorted(person_signs.items(), key=lambda x: len(x[0]), reverse=True)

    for signifier, signified in sorted_signs:
        # Ambiguous references:
        pattern = re.compile(signifier, re.IGNORECASE)
        ordered_signified = sorted(signified, key=lambda a: a.get_indicator())
        if info == "officer":
            # replacement as "Officer #1 or Officer #2"
            codename = " or ".join([p.get_indicator() for p in ordered_signified])
        elif info == "person":
            # replacement as "(PERSON_1 or PERSON_2)" rather than
            # "(PERSON_1) or (PERSON_2)"
            codename = "(%s)" % " or ".join(
                [re.sub(r"[\(\)]", "", p.get_indicator()) for p in ordered_signified]
            )
        else:
            # Fail fast with a clear message. The original left `codename`
            # unbound here, producing an opaque NameError on the first match.
            raise ValueError("unsupported info label: {!r}".format(info))

        for match in pattern.finditer(doc.text):
            replacement = codename
            start, end = match.span()
            # Special case: the rare terminal-apostrophe possessive, such as
            # "Moses'" where the correct redaction synthetically adds the 's.
            # TODO(jnu): probably better to handle this where we handle the
            # indefinite article redaction, in SourceText.
            if doc.text[end : end + 2] == "' ":
                replacement = codename + "'s"
                end += 1

            # NOTE: a dead statement (`ordered_signified[0]`) left over from
            # a coloring/classing TODO was removed here.
            yield doc.redact(
                start,
                end,
                replacement,
                auto_capitalize=False,
                autocorrect_article=False,
                info=info,
            )

616 

617 

def mask_person_fuzzy(
    doc: SourceText,
    persons: Iterable[PersonName],
    info: str,
) -> Generator[Redaction, None, None]:
    """Generate a list of redactions for the persons given in the input
    by redacting proper nouns in the text which are similar to last names in
    persons.

    :param doc: Source text
    :param persons: List of person references to redact
    :param info: Category label to pass to each redaction
    :yields: Redaction instances
    """
    # Only consider reasonably long proper nouns to limit false positives.
    min_character_limit = 5
    proper_nouns = {
        token
        for token in doc.nlp
        if token.pos_ == "PROPN" and len(token) > min_character_limit
    }

    for token in proper_nouns:
        begin = token.idx
        finish = begin + len(token)

        if not doc.can_redact(begin, finish):
            continue

        upper_token = {token.text.upper()}
        matched_persons = [
            person
            for person in persons
            if _name_match({f"{person.first} {person.last}"}, upper_token)
            or _name_match(person.last, upper_token, 1)
            or _name_match(person.first, upper_token, 1)
        ]

        if not matched_persons:
            continue

        replacement = "(%s)" % " or ".join(
            [re.sub(r"[\(\)]", "", person.get_indicator()) for person in matched_persons]
        )
        yield doc.redact(
            begin,
            finish,
            replacement,
            auto_capitalize=False,
            autocorrect_article=False,
            info=info,
        )

671 

672 

def mask(
    locale: Locale,
    narrative: str,
    persons: Iterable[PersonName],
    officers: Iterable[OfficerName],
    literals: dict[str, list[str]] | None = None,
) -> List[Redaction]:
    """Apply masking and formatting to narrative text.

    :param locale: Locale to use for masking
    :param narrative: Incident report text
    :param persons: List of names of people appearing in text
    :param officers: List of names of officers appearing in text
    :param literals: Optional dictionary of custom lists to extend redaction
    :returns: List of redactions
    """
    doc = SourceText(narrative)

    # Rules run in the order listed; each generator is fully consumed before
    # the next starts, exactly as itertools.chain would do.
    rules = (
        mask_person(doc, officers, "officer"),
        mask_person(doc, persons, "person"),
        mask_street_address(doc),
        mask_district(doc, locale),
        mask_known_street_name(doc, locale),
        mask_presumed_street_name(doc),
        mask_neighborhood(doc, locale),
        mask_skin_color(doc),
        mask_hair_style(doc),
        mask_hair_color(doc),
        mask_eye_color(doc),
        mask_appearance_list(doc),
        mask_race_abbrev(doc),
        mask_race(doc),
        mask_race_correlated_feature(doc),
        mask_country(doc),
        mask_language(doc),
        mask_nationality(doc),
        mask_person_fuzzy(doc, persons, "person"),
        mask_other_literals(doc, literals),
    )

    redactions = list[Redaction]()
    for rule in rules:
        redactions.extend(rule)
    return redactions

714 

715 

def merge_annotations(annotations, narrative: str) -> List[Redaction]:
    """Merge 'person' annotations that contain the same text and info
    if they are only separated by a single white space

    e.g. "(S1) (S1)" -> "(S1)"
    :param annotations: unsorted list of annotations
    :param narrative: Incident report text
    :returns: reverse sorted list of merged annotations
    """
    if not annotations or len(annotations) <= 1:
        return annotations

    # Order redactions by character number, last to first. (Sorts in place,
    # mutating the caller's list, as the original did.)
    annotations.sort(key=lambda a: a.start, reverse=True)

    merged = list[Redaction]()
    current = annotations[0]

    for candidate in annotations[1:]:
        # `candidate` precedes `current` in the text. Merge when both are
        # identical "person" tags separated by exactly one whitespace char.
        gap_is_single_space = (
            current.start - candidate.end <= 1
            and re.match(r"\s", narrative[candidate.end : current.start])
        )
        same_person_tag = (
            current.text == candidate.text
            and current.info == candidate.info
            and current.info == "person"
        )
        if current.start - candidate.end <= 1 and same_person_tag and gap_is_single_space:
            # Extend the kept annotation backwards over the duplicate.
            current.start = candidate.start
        else:
            merged.append(current)
            current = candidate
    merged.append(current)

    return merged

749 

750 

def annotate(
    locale: Locale,
    narrative: str,
    persons: Iterable[dict],
    officers: Iterable[dict],
    redact_officers_from_text: bool = True,
    literals: dict[str, list[str]] | None = None,
) -> List[Redaction]:
    """Apply redaction tool and formatting to narrative text.

    :param locale: location of narrative
    :param narrative: Incident report text
    :param persons: List of people appearing in text
    :param officers: List of officers appearing in text
    :param redact_officers_from_text: Whether to redact officers from text
    :param literals: Optional dictionary of custom lists to extend redaction
    :returns: redaction annotations
    """
    person_types = set(locale.indicators.keys())

    known_persons = [PersonName(**p) for p in locale.filter_names(persons)]
    known_officers = [OfficerName(**o) for o in officers]

    # get_persons_from_narrative only applicable to sf right now, will
    # refactor later
    known_persons += get_persons_from_narrative(narrative, 0, person_types)
    if redact_officers_from_text:
        known_officers += get_officers_from_narrative(narrative)

    known_persons = PersonName.dedupe(known_persons, locale)
    known_officers = OfficerName.dedupe(known_officers, locale)

    # Create redactions, then collapse adjacent duplicate person tags.
    annotations = mask(
        locale,
        narrative,
        persons=known_persons,
        officers=known_officers,
        literals=literals,
    )
    return merge_annotations(annotations, narrative)