Coverage for blind_charging/masker.py: 90%
245 statements
« prev ^ index » next coverage.py v6.5.0, created at 2025-10-20 15:43 +0000
« prev ^ index » next coverage.py v6.5.0, created at 2025-10-20 15:43 +0000
1import itertools
2import re
3from typing import DefaultDict, Dict, Generator, Iterable, List, Set, Union
5from .annotation import Redaction
6from .locale import Locale
7from .locale.const import USPS_STREET_ABBR
8from .mask_const import (
9 APPEARANCE_LIST,
10 COUNTRIES,
11 EYE_COLORS,
12 EYE_REF,
13 GENERAL_COLORS,
14 HAIR_ADJS,
15 HAIR_COLORS,
16 HAIR_REF,
17 LANGUAGES,
18 NATIONALITIES,
19 PERSON_REF,
20 RACE_ABBREV,
21 RACE_FEATURES,
22 RACE_WORDS,
23 SENSITIVE_HAIR_REF,
24 SKIN_COLORS,
25)
26from .officer import OfficerName
27from .person import PersonName, _name_match
28from .re_util import re_literal_group
29from .source_text import SourceText
30from .text_processing import get_officers_from_narrative, get_persons_from_narrative
32AnyPerson = Union[OfficerName, PersonName]
35# TODO(jnu): rewrite to generalize common behaviors. Really we only have three
36# approaches: using PersonNames, using RegEx, and using NER. Generalize these
37# as first-class rules that can be parameterized and applied.
def _re_literal_adj_list(adjectives: Iterable[str]) -> str:
    """Build a RegExp pattern that matches a run of literal adjectives.

    The pattern matches one or more adjectives (loosely separated by
    whitespace and commas), optionally followed by a conjunction ("and",
    "or", "&", "/"), an optional determiner ("a", "an", "the", "some",
    "any") and a second run of adjectives.

    :param adjectives: Adjective literals to match
    :returns: RegExp pattern
    """
    adj = re_literal_group(adjectives)
    word_conj = re_literal_group(["and", "or"], capture=False)
    symbol_conj = re_literal_group(["&", "/"], capture=False)
    determiner = re_literal_group(["a", "an", "the", "some", "any"], capture=False)

    # One adjective followed by any number of further adjectives.
    adj_run = adj + r"(?:\s+,?\s*" + adj + r",?)*"
    # Word conjunctions require surrounding whitespace; symbols do not.
    joiner = r"(?:\s+" + word_conj + r"\s+|\s*" + symbol_conj + r"\s*)"
    optional_det = r"(?:" + determiner + r"\s+)?"

    return r"\b" + adj_run + r"(?:" + joiner + optional_det + adj_run + r")?\b"
def _re_literal_noun_phrase(adjectives: Iterable[str], nouns: Iterable[str]) -> str:
    """Build a RegExp pattern for an adjective list followed by a literal noun.

    Example:
        pattern = f(["green", "black"], ["frog", "toad"])

    This pattern would match "green frog" and "black toad" and even
    "green and black toad."

    The matched noun is available through the named group "noun".

    :param adjectives: List of adjective literals
    :param nouns: List of noun literals
    :returns: RegExp pattern
    """
    adjective_part = _re_literal_adj_list(adjectives)
    noun_part = re_literal_group(nouns, name="noun")

    return adjective_part + r"\s+" + noun_part + r"\b"
def _redact_entities(
    doc: SourceText, literals: Iterable[str], placeholder: str, info: str = ""
) -> Generator[Redaction, None, None]:
    """Redact NLP entities whose text contains one of the given literals.

    Any entity text before or after the literal is kept and wrapped around
    the bracketed placeholder.

    :param doc: Source text
    :param literals: List of literal strings to match
    :param placeholder: String to use in lieu of matched entities
    :param info: Comment to pass to redaction for tracing
    :yields: Redactions
    """
    literal_group = re_literal_group(literals, capture=False)
    # Group 1 (lazy) is the text preceding the literal, group 2 the text
    # following it; both are optional.
    entity_re = re.compile(
        r"(.*?\s+)??\b{}\b(\s+.*)?".format(literal_group), re.IGNORECASE
    )
    tag = "[{}]".format(placeholder)

    # Entities are visited from last to first.
    for entity in doc.nlp.ents[::-1]:
        if not doc.can_redact(entity.start_char, entity.end_char):
            continue

        found = entity_re.match(entity.text)
        if found is None:
            continue

        prefix = found.group(1) or ""
        suffix = found.group(2) or ""
        yield doc.redact(
            entity.start_char, entity.end_char, prefix + tag + suffix, info=info
        )
def _redact_words(
    doc: SourceText, literals: Iterable[str], placeholder: str, info: str = ""
) -> Generator[Redaction, None, None]:
    """Redact individual NLP tokens whose text is exactly one of the literals.

    The comparison is case-sensitive and tokens are visited last to first.

    :param doc: Source text
    :param literals: List of literal strings to match
    :param placeholder: String to use in lieu of matching words
    :param info: Comment to pass to redaction for tracing
    :yields: Redaction
    """
    wanted = set(literals)
    tag = "[{}]".format(placeholder)

    for token in reversed(list(doc.nlp)):
        begin = token.idx
        finish = begin + len(token)
        if doc.can_redact(begin, finish) and token.text in wanted:
            yield doc.redact(begin, finish, tag, info=info)
def mask_skin_color(
    doc: SourceText, placeholder: str = "race/ethnicity"
) -> Generator[Redaction, None, None]:
    """Yield redactions for skin-color words that describe a person.

    E.g., "black person" -> "[race/ethnicity] person"

    Only the adjective part is replaced; the person noun is kept.

    NOTE: There may be overlap here with rules that deal with ethnicity
    directly.

    :param doc: Source text
    :param placeholder: String to use in lieu of skin color words.
    :yields: Redactions
    """
    skin_color_re = re.compile(
        _re_literal_noun_phrase(SKIN_COLORS | RACE_WORDS, PERSON_REF),
        re.IGNORECASE,
    )

    for hit in skin_color_re.finditer(doc.text):
        kept_noun = hit.group("noun")
        yield doc.redact(
            hit.start(),
            hit.end(),
            f"[{placeholder}] {kept_noun}",
            info="skin color",
        )
def mask_hair_color(
    doc: SourceText, placeholder: str = "color"
) -> Generator[Redaction, None, None]:
    """Yield redactions for color words that describe hair.

    E.g., "red hair" -> "[color] hair"

    :param doc: Source text
    :param placeholder: String to use in lieu of color word
    :yields: Redactions
    """
    hair_color_re = re.compile(
        _re_literal_noun_phrase(GENERAL_COLORS | HAIR_COLORS, HAIR_REF),
        re.IGNORECASE,
    )

    for hit in hair_color_re.finditer(doc.text):
        # Keep the hair noun; only the color adjectives are replaced.
        kept_noun = hit.group("noun")
        yield doc.redact(
            hit.start(),
            hit.end(),
            f"[{placeholder}] {kept_noun}",
            info="hair color",
        )
def mask_hair_style(
    doc: SourceText, placeholder: str = "hairstyle"
) -> Generator[Redaction, None, None]:
    """Yield redactions for hair style descriptions.

    E.g., "black short afro hair" -> "[hairstyle] hair"

    Two passes are made over the text: first adjective + hair-noun phrases,
    then bare occurrences of the sensitive hair terms.

    :param doc: Source text
    :param placeholder: String to use in lieu of hair style
    :yields: Redaction
    """
    adjectives = SENSITIVE_HAIR_REF | HAIR_ADJS | GENERAL_COLORS | HAIR_COLORS
    nouns = SENSITIVE_HAIR_REF | HAIR_REF
    tag = "[{}] hair".format(placeholder)

    patterns = (
        _re_literal_noun_phrase(adjectives, nouns),
        re_literal_group(SENSITIVE_HAIR_REF),
    )

    for source in patterns:
        for hit in re.compile(source, re.IGNORECASE).finditer(doc.text):
            yield doc.redact(hit.start(), hit.end(), tag, info="hair style")
def mask_eye_color(
    doc: SourceText, placeholder: str = "color"
) -> Generator[Redaction, None, None]:
    """Yield redactions for color words that describe eyes.

    E.g., "blue eyes" -> "[color] eyes"

    :param doc: Source text
    :param placeholder: String to use in lieu of color word
    :yields: Redactions
    """
    eye_color_re = re.compile(
        _re_literal_noun_phrase(GENERAL_COLORS | EYE_COLORS, EYE_REF),
        re.IGNORECASE,
    )

    for hit in eye_color_re.finditer(doc.text):
        # Keep the eye noun; only the color adjectives are replaced.
        kept_noun = hit.group("noun")
        yield doc.redact(
            hit.start(),
            hit.end(),
            f"[{placeholder}] {kept_noun}",
            info="eye color",
        )
def mask_country(
    doc: SourceText, placeholder: str = "country"
) -> Generator[Redaction, None, None]:
    """Yield redactions for country names.

    E.g., "Burundi" -> "[country]"

    Matching runs first against NLP entities, then against single tokens.

    :param doc: Source text
    :param placeholder: String to use in lieu of country name
    :yields: Redactions
    """
    for redactor in (_redact_entities, _redact_words):
        yield from redactor(doc, COUNTRIES, placeholder, info="country")
def mask_language(
    doc: SourceText, placeholder: str = "language"
) -> Generator[Redaction, None, None]:
    """Yield redactions for language names.

    E.g., "Spanish" -> "[language]"

    Matching runs first against NLP entities, then against single tokens.

    :param doc: Source text
    :param placeholder: String to use in lieu of language
    :yields: Redactions
    """
    for redactor in (_redact_entities, _redact_words):
        yield from redactor(doc, LANGUAGES, placeholder, info="language")
def mask_nationality(
    doc: SourceText, placeholder: str = "nationality/ethnicity"
) -> Generator[Redaction, None, None]:
    """Yield redactions for nationalities.

    E.g., "Mexican" -> "[nationality/ethnicity]"

    :param doc: Source text
    :param placeholder: String to use in lieu of nationality
    :yields: Redactions
    """
    # NOTE(acw): Tried using spacy's NER classifier alone here, but it would
    # too often classify irrelevant words (e.g., "5/18/2019" or "Silver Honda")
    # as languages or locations. Entities and tokens are therefore both
    # filtered against the literal NATIONALITIES list.
    for redactor in (_redact_entities, _redact_words):
        yield from redactor(doc, NATIONALITIES, placeholder, info="nationality")
def mask_race(
    doc: SourceText, placeholder: str = "race/ethnicity"
) -> Generator[Redaction, None, None]:
    """Yield redactions for words that directly indicate race.

    E.g., "African American" -> "[race/ethnicity]"

    A whole run of race words (see ``_re_literal_adj_list``) is replaced by
    a single placeholder.

    :param doc: Source text
    :param placeholder: String to use in lieu of race/ethnicity
    :yields: Redactions
    """
    race_re = re.compile(_re_literal_adj_list(RACE_WORDS), re.IGNORECASE)
    tag = f"[{placeholder}]"

    for hit in race_re.finditer(doc.text):
        yield doc.redact(hit.start(), hit.end(), tag, info="race")
def mask_other_literals(
    doc: SourceText,
    literals: dict[str, list[str]] | None,
) -> Generator[Redaction, None, None]:
    """Generate redactions based on custom lists of literal words.

    Example:
        literals = {
            "district": ["lake district", "park district"],
        }

        "The suspect was last seen in the Park District" ->
        "The suspect was last seen in the [district]"

    Matching is case-insensitive. Keys whose value list is empty are skipped.

    :param doc: Source text
    :param literals: Dictionary describing literal words to redact. Keys will
        be used to substitute for each of the values in the associated list.
        ``None`` or an empty dictionary yields nothing.
    :yields: Redactions
    """
    if not literals:
        return

    for literal, values in literals.items():
        if not values:
            # Nothing to match for this key. How re_literal_group treats an
            # empty list is not visible here, so avoid compiling a pattern
            # built from zero alternatives (which could match empty spans).
            continue

        literal_re = re.compile(re_literal_group(values), re.IGNORECASE)
        replacement = "[{}]".format(literal)

        for match in literal_re.finditer(doc.text):
            start, end = match.span()
            yield doc.redact(start, end, replacement, info=literal)
def mask_race_correlated_feature(
    doc: SourceText, placeholder: str = "physical description"
) -> Generator[Redaction, None, None]:
    """Yield redactions for stand-alone features that correlate with race.

    These words are redacted wherever they occur as whole words, without
    looking at the surrounding context.

    E.g., "We saw a blonde" -> "We saw a [physical description]"

    :param doc: Source text
    :param placeholder: String to use in lieu of race-correlated features
    :yields: Redactions
    """
    feature_re = re.compile(
        r"\b{}\b".format(re_literal_group(RACE_FEATURES)), re.IGNORECASE
    )
    tag = f"[{placeholder}]"

    for hit in feature_re.finditer(doc.text):
        yield doc.redact(hit.start(), hit.end(), tag, info="race")
def mask_race_abbrev(
    doc: SourceText, placeholder: str = "race/ethnicity"
) -> Generator[Redaction, None, None]:
    """Yield redactions for abbreviations that encode race, sex and age.

    E.g., "AMA" -> "[race/ethnicity] male adult"

    The race part is replaced by the placeholder while the sex and age codes
    are spelled out.

    :param doc: Source text
    :param placeholder: String to use in lieu of race/ethnicity
    :yields: Redactions
    """
    # NOTE(review): RACE_ABBREV is defined elsewhere; this code assumes it is
    # a RegExp fragment whose 2nd and 3rd groups capture the sex and age
    # codes -- confirm against mask_const.
    # Deliberately compiled without IGNORECASE: matching is case-sensitive.
    abbrev_re = re.compile(r"(?<=\b){}s?(?=\b)".format(RACE_ABBREV))

    sex_names = {"F": "female", "M": "male"}
    age_names = {"A": "adult", "J": "juvenile"}

    for hit in abbrev_re.finditer(doc.text):
        sex = sex_names.get(hit.group(2))
        age = age_names.get(hit.group(3))
        yield doc.redact(
            hit.start(), hit.end(), f"[{placeholder}] {sex} {age}", info="race"
        )
def mask_appearance_list(
    doc: SourceText, placeholder: str = "color"
) -> Generator[Redaction, None, None]:
    """Generate redactions for appearance attributes written in list format.

    E.g., "Race: Hispanic" -> "Race: [race/ethnicity]"
    E.g., "Hair: Black" -> "Hair: [color]"

    The label before the colon is kept as written; only the value is
    replaced. "Race" and "Complexion" entries always use the fixed
    "race/ethnicity" label; every other entry uses ``placeholder``.

    :param doc: Source text
    :param placeholder: String to use in lieu of feature
    :yields: Redactions
    """
    color_group = _re_literal_adj_list(
        SKIN_COLORS | HAIR_COLORS | HAIR_ADJS | EYE_COLORS | GENERAL_COLORS
    )
    appearance_group = re_literal_group(APPEARANCE_LIST, name="noun")
    pattern = r"{}:\s*{}".format(appearance_group, color_group)
    appearance_list_re = re.compile(pattern, re.IGNORECASE)

    for match in appearance_list_re.finditer(doc.text):
        noun = match.group("noun")
        # The pattern is case-insensitive, so classify on the lowered label.
        noun_key = noun.lower()

        # Pick the label per match. The caller's placeholder must not be
        # overwritten here, or a single "Race:" entry would relabel every
        # later "Hair:"/"Eyes:" entry as race/ethnicity.
        label = placeholder
        if noun_key in ("race", "complexion"):
            label = "race/ethnicity"
            info = "race"
        elif noun_key == "eyes":
            info = "eye color"
        elif noun_key == "hair":
            info = "hair color"
        else:
            info = "appearance list"

        start, end = match.span()
        replacement = "{}: [{}]".format(noun, label)
        yield doc.redact(start, end, replacement, info=info)
def mask_street_address(
    doc: SourceText, placeholder: str = "location"
) -> Generator[Redaction, None, None]:
    """Yield redactions for numbered street addresses.

    E.g., "123 Maple St." -> "[location] St."

    The house number and street name are replaced; the street-type ending
    (group 1 of the pattern) is kept.

    :param doc: Source text
    :param placeholder: Text to use in lieu of literal street address
    :yields: Redactions
    """
    suffixes = re_literal_group(USPS_STREET_ABBR)
    address_re = re.compile(
        r"(?:\d{1,5} [\w\s]{1,20}) (" + suffixes + r"\.?)\W?(?=\s|$)",
        re.IGNORECASE,
    )

    # Phrases that look like addresses but are not:
    # e.g. 30 mph, #2 lane
    false_positive_re = re.compile(
        r"\d{1,3}\s?mph\b|\b#?\d\s?([nesw]/?b\s?)?lane\b",  # speed | lane in road
        re.IGNORECASE,
    )

    for hit in address_re.finditer(doc.text):
        if false_positive_re.search(hit.group(0)):
            continue

        yield doc.redact(
            hit.start(),
            hit.end(),
            "[{}] {}".format(placeholder, hit.group(1)),
            info="street address",
        )
def mask_district(
    doc: SourceText, locale: Locale, placeholder: str = "district"
) -> Generator[Redaction, None, None]:
    """Yield redactions for police precincts / districts.

    District matches come from ``locale.match_district``. Group 2 of each
    match, when present, is lower-cased and appended after the placeholder.

    :param doc: Source text
    :param locale: Locale to use for masking
    :param placeholder: Text to use in lieu of literal district name
    :yields: Redactions
    """
    tag = "[{}]".format(placeholder)

    for hit in locale.match_district(doc.text):
        begin, finish = hit.span()
        suffix = (hit.group(2) or "").lower()

        # Only append the suffix when there is one and it would not be
        # awkwardly redundant, as in "[district] district".
        if suffix and suffix != placeholder:
            replacement = tag + " " + suffix
        else:
            replacement = tag

        yield doc.redact(begin, finish, replacement, info="district name")
def mask_presumed_street_name(
    doc: SourceText, placeholder: str = "street"
) -> Generator[Redaction, None, None]:
    """Generate redactions for entities that look like street names.

    E.g., "Maple St." -> "[street] St."

    A candidate is a run of words that each start with a digit or a capital
    letter, followed by a street-type ending. The ending (group 1) is kept.

    :param doc: Source text
    :param placeholder: Text to use in lieu of street name
    :yields: Redactions
    """
    # Each ending is accepted as given, capitalized, and upper-cased. Built
    # with a flat comprehension; summing lists would copy the accumulated
    # list on every step.
    ending_variants = [
        variant
        for abbr in USPS_STREET_ABBR
        for variant in (abbr, abbr.capitalize(), abbr.upper())
    ]
    street_endings = re_literal_group(ending_variants, capture=False)

    # The final lookahead matches any `\b` except `\.` (the period is matched
    # by the second part). This keeps the period (e.g. in "St.") in the
    # replacement.
    street_name_pattern = (
        r"(?:(?:\d+|[A-Z])[A-Za-z\']*\s+)+"
        + r"(%s\.?)" % street_endings
        + r"(?=[,\/#!$%\^&\*;:{}=\-_`~()\s])"
    )
    # NOTE(jnu): this is not case insensitive; the point is to use the
    # capitalization structure to infer words that might constitute a street
    # name.
    street_name_re = re.compile(street_name_pattern)

    # Avoid matching false street names:
    # e.g. EB lane, E/B lane, #2 lane (on the freeway)
    bad_patterns_re = re.compile(r"\b(#?\d\s)?([nesw]/?b\s?)?lane\b", re.IGNORECASE)

    for match in street_name_re.finditer(doc.text):
        if bad_patterns_re.search(match.group(0)):
            continue

        start, end = match.span()
        replacement = "[{}] {}".format(placeholder, match.group(1))
        yield doc.redact(start, end, replacement, info="presumed street name")
def mask_known_street_name(
    doc: SourceText, locale: Locale, placeholder: str = "street"
) -> Generator[Redaction, None, None]:
    """Yield redactions for pairs of known streets in the city.

    E.g., "Arguello and Euclid" -> "[street] and [street]"

    Matches come from ``locale.match_street_name``; the text of the match's
    "conj" group is kept between the two placeholders.

    :param doc: Source text
    :param locale: Locale to use for masking
    :param placeholder: Text to use in lieu of street name
    :yields: Redactions
    """
    for hit in locale.match_street_name(doc.text):
        begin, finish = hit.span()
        joined = "[{0}]{1}[{0}]".format(placeholder, hit.group("conj"))
        yield doc.redact(begin, finish, joined, info="known street name")
def mask_neighborhood(
    doc: SourceText, locale: Locale, placeholder: str = "neighborhood"
) -> Generator[Redaction, None, None]:
    """Yield redactions for neighborhoods in the city.

    E.g., "Parkside" -> "[neighborhood]"

    Only NLP entities are checked, against ``locale.neighborhoods``.

    :param doc: Source text
    :param locale: Locale to use to perform masking
    :param placeholder: Text to use in lieu of neighborhood name
    :yields: Redactions
    """
    # TODO(jnu): improve Locale API for matching these
    known_names = locale.neighborhoods
    yield from _redact_entities(doc, known_names, placeholder, info="neighborhood")
def _create_person_name_map(persons: Iterable[AnyPerson]) -> Dict[str, Set[AnyPerson]]:
    """Index persons by every surface form of their name.

    Each string produced by a person's ``name_rep()`` (such as
    "John P. Smith") becomes a key pointing at the set of persons that
    could be meant by it. Usually that set has one member, but an ambiguous
    form such as "J. Smith" may point at several individuals.

    :param persons: List of person references
    :returns: Map from names to person references
    """
    name_map: Dict[str, Set[AnyPerson]] = {}

    for person in persons:
        for surface_form in person.name_rep():
            name_map.setdefault(surface_form, set()).add(person)

    return name_map
def mask_person(
    doc: SourceText,
    persons: Iterable[AnyPerson],
    info: str,
) -> Generator[Redaction, None, None]:
    """Generate a list of redactions for the persons given in the input.

    Every surface form of every person's name is searched for in the text
    (case-insensitively) and replaced by the indicator(s) of the person(s)
    it may refer to.

    :param doc: Source text
    :param persons: List of person references to redact
    :param info: Kind of person being redacted; selects the replacement
        format and is passed to the redaction for tracing. Supported values
        are "officer" and "person".
    :yields: Redaction instances
    :raises ValueError: If a name is found in the text but ``info`` is not a
        supported value (no replacement format is defined for it)
    """
    person_signs = _create_person_name_map(persons)

    # Process surface representations of names in order of longest to shortest.
    # This means the longest names will be replaced first, which should help to
    # avoid ambiguity.
    sorted_signs = sorted(person_signs.items(), key=lambda x: len(x[0]), reverse=True)

    for signifier, signified in sorted_signs:
        # NOTE(review): the surface form is compiled as a RegExp as-is (not
        # escaped) -- presumably name_rep() yields patterns; confirm.
        pattern = re.compile(signifier, re.IGNORECASE)

        # Ambiguous references: list every candidate in a stable order.
        ordered_signified = sorted(signified, key=lambda a: a.get_indicator())
        if info == "officer":
            # replacement as "Officer #1 or Officer #2"
            codename = " or ".join([p.get_indicator() for p in ordered_signified])
        elif info == "person":
            # replacement as "(PERSON_1 or PERSON_2)" rather than
            # "(PERSON_1) or (PERSON_2)"
            codename = "(%s)" % " or ".join(
                [re.sub(r"[\(\)]", "", p.get_indicator()) for p in ordered_signified]
            )
        else:
            codename = None

        for match in pattern.finditer(doc.text):
            if codename is None:
                # Fail with a clear message instead of an unbound local; only
                # raised once there is actually something to replace.
                raise ValueError(
                    "Unsupported info {!r}; expected 'officer' or 'person'".format(
                        info
                    )
                )

            replacement = codename
            start, end = match.span()
            # Special case: the rare terminal-apostrophe possessive, such as
            # "Moses'" where the correct redaction synthetically adds the 's.
            # The span grows by one so the original apostrophe is replaced.
            # TODO(jnu): probably better to handle this where we handle the
            # indefinite article redaction, in SourceText.
            if doc.text[end : end + 2] == "' ":
                replacement = codename + "'s"
                end += 1

            # TODO(jnu): clean up coloring and classing
            yield doc.redact(
                start,
                end,
                replacement,
                auto_capitalize=False,
                autocorrect_article=False,
                info=info,
            )
def mask_person_fuzzy(
    doc: SourceText,
    persons: Iterable[PersonName],
    info: str,
) -> Generator[Redaction, None, None]:
    """Generate a list of redactions for the persons given in the input
    by redacting proper nouns in the text which are similar to names in
    persons.

    Only tokens tagged as proper nouns and longer than five characters are
    considered. A token is redacted when ``_name_match`` accepts it against
    a person's full name, last name or first name; the replacement lists the
    indicators of all matching persons, e.g. "(S1 or V2)".

    :param doc: Source text
    :param persons: List of person references to redact
    :param info: Comment to pass to redaction for tracing
    :yields: Redaction instances
    """
    min_character_limit = 5

    # Materialize once: the persons are re-scanned for every candidate token,
    # which would silently exhaust a one-shot iterable after the first token.
    known_persons = list(persons)

    # A list (not a set) keeps document order, so output is deterministic.
    propn_tokens = [
        token
        for token in doc.nlp
        if token.pos_ == "PROPN" and len(token) > min_character_limit
    ]

    # Visit tokens last to first, like the other token-based redaction rule.
    for token in reversed(propn_tokens):
        start_char = token.idx
        end_char = start_char + len(token)

        if not doc.can_redact(start_char, end_char):
            continue

        token_upper = token.text.upper()
        valid_persons = [
            person
            for person in known_persons
            if _name_match({f"{person.first} {person.last}"}, {token_upper})
            or _name_match(person.last, {token_upper}, 1)
            or _name_match(person.first, {token_upper}, 1)
        ]
        if not valid_persons:
            continue

        # Same format as exact person matches: strip each indicator's own
        # parentheses and wrap the whole alternative list once.
        replacement = "(%s)" % " or ".join(
            [re.sub(r"[\(\)]", "", person.get_indicator()) for person in valid_persons]
        )
        yield doc.redact(
            start_char,
            end_char,
            replacement,
            auto_capitalize=False,
            autocorrect_article=False,
            info=info,
        )
def mask(
    locale: Locale,
    narrative: str,
    persons: Iterable[PersonName],
    officers: Iterable[OfficerName],
    literals: dict[str, list[str]] | None = None,
) -> List[Redaction]:
    """Apply masking and formatting to narrative text.

    All masking rules are run against one shared ``SourceText``, in the
    order listed in the body.

    :param locale: Locale used for district, street and neighborhood rules
    :param narrative: Incident report text
    :param persons: List of names of people appearing in text
    :param officers: List of names of officers appearing in text
    :param literals: Optional dictionary of custom lists to extend redaction
    :returns: List of redactions
    """
    doc = SourceText(narrative)

    # persons feeds two separate passes (exact and fuzzy). Materialize it so
    # a one-shot iterable is not already exhausted when the fuzzy pass runs.
    persons = list(persons)

    # itertools.chain drains each generator in turn, so rules run in exactly
    # this order. NOTE(review): earlier rules presumably take precedence on
    # overlapping spans (via doc.can_redact / doc.redact) -- confirm in
    # SourceText before reordering.
    return list(
        itertools.chain(
            mask_person(doc, officers, "officer"),
            mask_person(doc, persons, "person"),
            mask_street_address(doc),
            mask_district(doc, locale),
            mask_known_street_name(doc, locale),
            mask_presumed_street_name(doc),
            mask_neighborhood(doc, locale),
            mask_skin_color(doc),
            mask_hair_style(doc),
            mask_hair_color(doc),
            mask_eye_color(doc),
            mask_appearance_list(doc),
            mask_race_abbrev(doc),
            mask_race(doc),
            mask_race_correlated_feature(doc),
            mask_country(doc),
            mask_language(doc),
            mask_nationality(doc),
            mask_person_fuzzy(doc, persons, "person"),
            mask_other_literals(doc, literals),
        )
    )
def merge_annotations(annotations, narrative: str) -> List[Redaction]:
    """Merge 'person' annotations that contain the same text and info
    if they are only separated by a single white space

    e.g. "(S1) (S1)" -> "(S1)"

    The input sequence itself is not reordered. When two annotations are
    merged, the later one is kept and its ``start`` is moved back to the
    earlier one's ``start`` (that annotation object is modified in place).

    :param annotations: unsorted list of annotations
    :param narrative: Incident report text
    :returns: reverse sorted list of merged annotations; inputs with fewer
        than two annotations are returned unchanged
    """
    if not annotations or len(annotations) <= 1:
        return annotations

    # order redactions by character number, last to first (on a copy, so the
    # caller's list keeps its order)
    ordered = sorted(annotations, key=lambda x: x.start, reverse=True)

    final_annotations: List[Redaction] = []
    end_annotation = ordered[0]

    for annotation in ordered[1:]:
        if (
            end_annotation.start - annotation.end <= 1
            and end_annotation.text == annotation.text
            and end_annotation.info == annotation.info
            and end_annotation.info == "person"
            # the gap between the two must start with whitespace; together
            # with the distance check this means exactly one whitespace char
            and re.match(r"\s", narrative[annotation.end : end_annotation.start])
        ):
            # absorb the earlier annotation into the later one
            end_annotation.start = annotation.start
        else:
            final_annotations.append(end_annotation)
            end_annotation = annotation
    final_annotations.append(end_annotation)

    return final_annotations
def annotate(
    locale: Locale,
    narrative: str,
    persons: Iterable[dict],
    officers: Iterable[dict],
    redact_officers_from_text: bool = True,
    literals: dict[str, list[str]] | None = None,
) -> List[Redaction]:
    """Run the full redaction pipeline on a narrative.

    Person and officer records are turned into name objects, extended with
    names extracted from the narrative itself, de-duplicated, and handed to
    ``mask``. The resulting annotations are then merged.

    :param locale: location of narrative
    :param narrative: Incident report text
    :param persons: List of people appearing in text
    :param officers: List of officers appearing in text
    :param redact_officers_from_text: Whether to redact officers from text
    :param literals: Optional dictionary of custom lists to extend redaction
    :returns: redaction annotations
    """
    person_types = set(locale.indicators.keys())

    # Build name objects from the supplied records.
    person_names = [PersonName(**entry) for entry in locale.filter_names(persons)]
    officer_names = [OfficerName(**entry) for entry in officers]

    # Add names found in the narrative text.
    # get_persons_from_narrative only applicable to sf right now, will refactor later
    person_names.extend(get_persons_from_narrative(narrative, 0, person_types))
    if redact_officers_from_text:
        officer_names.extend(get_officers_from_narrative(narrative))

    # create redactions from the de-duplicated names, then merge them
    return merge_annotations(
        mask(
            locale,
            narrative,
            persons=PersonName.dedupe(person_names, locale),
            officers=OfficerName.dedupe(officer_names, locale),
            literals=literals,
        ),
        narrative,
    )