Coverage for blind_charging/masker.py: 90% (235 statements)
import itertools
import re
from typing import DefaultDict, Dict, Generator, Iterable, List, Set, Union

from .annotation import Redaction
from .locale import Locale
from .locale.const import USPS_STREET_ABBR
from .mask_const import (
    APPEARANCE_LIST,
    COUNTRIES,
    EYE_COLORS,
    EYE_REF,
    GENERAL_COLORS,
    HAIR_ADJS,
    HAIR_COLORS,
    HAIR_REF,
    LANGUAGES,
    NATIONALITIES,
    PERSON_REF,
    RACE_ABBREV,
    RACE_FEATURES,
    RACE_WORDS,
    SENSITIVE_HAIR_REF,
    SKIN_COLORS,
    SLURS,
)
from .officer import OfficerName
from .person import PersonName, _name_match
from .re_util import re_literal_group
from .source_text import SourceText
from .text_processing import get_officers_from_narrative, get_persons_from_narrative

AnyPerson = Union[OfficerName, PersonName]

# TODO(jnu): rewrite to generalize common behaviors. Really we only have three
# approaches: using PersonNames, using RegEx, and using NER. Generalize these
# as first-class rules that can be parameterized and applied.


def _re_literal_adj_list(adjectives: Iterable[str]) -> str:
    """
    Create a RegExp pattern for matching a list of adjectives with literals.

    :param adjectives: List of adjective literals
    :returns: RegExp pattern
    """
    adj_group = re_literal_group(adjectives)
    conj_group = re_literal_group(["and", "or"], capture=False)
    conj_sym_group = re_literal_group(["&", "/"], capture=False)
    det_group = re_literal_group(["a", "an", "the", "some", "any"], capture=False)

    return (
        # fmt: off
        r"\b{adj}(?:\s+,?\s*{adj},?)*"
        r"(?:(?:\s+{cnj}\s+|\s*{cnj_sym}\s*)(?:{det}\s+)?{adj}(?:\s+,?\s*{adj},?)*)?\b"
        # fmt: on
    ).format(adj=adj_group, cnj=conj_group, cnj_sym=conj_sym_group, det=det_group)
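

# Illustrative sketch of what the generated pattern matches (hypothetical
# literals; re_literal_group is assumed to join them into an alternation):
#
#     pattern = _re_literal_adj_list(["red", "green"])
#     re.search(pattern, "a red and green scarf", re.IGNORECASE)  # -> "red and green"
#     re.search(pattern, "a red & green scarf", re.IGNORECASE)    # -> "red & green"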


def _re_literal_noun_phrase(adjectives: Iterable[str], nouns: Iterable[str]) -> str:
    """Create a RegExp pattern for matching a simple noun phrase with literals.

    Example:
        pattern = f(["green", "black"], ["frog", "toad"])

    This pattern would match "green frog" and "black toad" and even
    "green and black toad."

    :param adjectives: List of adjective literals
    :param nouns: List of noun literals
    :returns: RegExp pattern
    """
    adj_list = _re_literal_adj_list(adjectives)
    noun_group = re_literal_group(nouns, name="noun")

    return r"{adj}\s+{n}\b".format(adj=adj_list, n=noun_group)


def _redact_entities(
    doc: SourceText, literals: Iterable[str], placeholder: str, info: str = ""
) -> Generator[Redaction, None, None]:
    """Redact NLP entities matching the given list.

    :param doc: Source text
    :param literals: List of literal strings to match
    :param placeholder: String to use in lieu of matched entities
    :param info: Comment to pass to redaction for tracing
    :yields: Redactions
    """
    search_names = re_literal_group(literals, capture=False)
    # matches search names lazily to allow for longest search name match
    search_pattern = r"(.*?\s+)??\b{}\b(\s+.*)?".format(search_names)
    search_re = re.compile(search_pattern, re.IGNORECASE)

    for ent in doc.nlp.ents[::-1]:
        if not doc.can_redact(ent.start_char, ent.end_char):
            continue
        m = search_re.match(ent.text)
        if m:
            start = ent.start_char
            end = ent.end_char
            pfx = m.group(1) or ""
            sfx = m.group(2) or ""
            replacement = "{}[{}]{}".format(pfx, placeholder, sfx)
            yield doc.redact(start, end, replacement, info=info)
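

# Sketch of how this helper is typically driven (see mask_country below).
# The entity spans come from the NLP pipeline exposed on SourceText.nlp,
# which is assumed here rather than shown:
#
#     doc = SourceText("He recently arrived from Burundi.")
#     list(_redact_entities(doc, COUNTRIES, "country", info="country"))
#     # -> a Redaction replacing the "Burundi" entity with "[country]"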


def _redact_words(
    doc: SourceText, literals: Iterable[str], placeholder: str, info: str = ""
) -> Generator[Redaction, None, None]:
    """Redact words as tokenized by NLP.

    :param doc: Source text
    :param literals: List of literal strings to match
    :param placeholder: String to use in lieu of matching words
    :param info: Comment to pass to redaction for tracing
    :yields: Redactions
    """
    candidates = set(literals)
    replacement = "[{}]".format(placeholder)

    for word in list(doc.nlp)[::-1]:
        start_char = word.idx
        end_char = start_char + len(word)
        if not doc.can_redact(start_char, end_char):
            continue
        if word.text in candidates:
            yield doc.redact(start_char, end_char, replacement, info=info)


def mask_skin_color(
    doc: SourceText, placeholder: str = "race/ethnicity"
) -> Generator[Redaction, None, None]:
    """Generate redactions for words used to describe skin color.

    E.g., "black person" -> "[race/ethnicity] person"

    NOTE: There may be overlap here with rules that deal with ethnicity
    directly.

    :param doc: Source text
    :param placeholder: String to use in lieu of skin color words.
    :yields: Redactions
    """
    pattern = _re_literal_noun_phrase(SKIN_COLORS | RACE_WORDS, PERSON_REF)
    skin_color_re = re.compile(pattern, re.IGNORECASE)

    for match in skin_color_re.finditer(doc.text):
        start, end = match.span()
        replacement = "[{}] {}".format(placeholder, match.group("noun"))
        yield doc.redact(start, end, replacement, info="skin color")
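

# Minimal usage sketch for the rule above, following its docstring example
# (the exact matches depend on the SKIN_COLORS / RACE_WORDS / PERSON_REF
# literal sets in mask_const):
#
#     doc = SourceText("Officers stopped a black person near the park.")
#     list(mask_skin_color(doc))
#     # -> "black person" becomes "[race/ethnicity] person"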


def mask_hair_color(
    doc: SourceText, placeholder: str = "color"
) -> Generator[Redaction, None, None]:
    """Generate redactions for hair color.

    E.g., "red hair" -> "[color] hair"

    :param doc: Source text
    :param placeholder: String to use in lieu of color word
    :yields: Redactions
    """
    hair_colors = GENERAL_COLORS | HAIR_COLORS
    pattern = _re_literal_noun_phrase(hair_colors, HAIR_REF)
    hair_color_re = re.compile(pattern, re.IGNORECASE)

    for match in hair_color_re.finditer(doc.text):
        start, end = match.span()
        replacement = "[{}] {}".format(placeholder, match.group("noun"))
        yield doc.redact(start, end, replacement, info="hair color")


def mask_hair_style(
    doc: SourceText, placeholder: str = "hairstyle"
) -> Generator[Redaction, None, None]:
    """Generate redactions for hair styles.

    E.g., "black short afro hair" -> "[hairstyle] hair"

    :param doc: Source text
    :param placeholder: String to use in lieu of hair style
    :yields: Redactions
    """
    hairstyle_adjs = SENSITIVE_HAIR_REF | HAIR_ADJS | GENERAL_COLORS | HAIR_COLORS
    hair_nouns = SENSITIVE_HAIR_REF | HAIR_REF
    replacement = "[{}] hair".format(placeholder)

    for pattern in [
        _re_literal_noun_phrase(hairstyle_adjs, hair_nouns),
        re_literal_group(SENSITIVE_HAIR_REF),
    ]:
        hairstyle_re = re.compile(pattern, re.IGNORECASE)
        for match in hairstyle_re.finditer(doc.text):
            start, end = match.span()
            yield doc.redact(start, end, replacement, info="hair style")


def mask_eye_color(
    doc: SourceText, placeholder: str = "color"
) -> Generator[Redaction, None, None]:
    """Generate redactions for eye color.

    E.g., "blue eyes" -> "[color] eyes"

    :param doc: Source text
    :param placeholder: String to use in lieu of color word
    :yields: Redactions
    """
    eye_colors = GENERAL_COLORS | EYE_COLORS
    pattern = _re_literal_noun_phrase(eye_colors, EYE_REF)
    eye_color_re = re.compile(pattern, re.IGNORECASE)

    for match in eye_color_re.finditer(doc.text):
        start, end = match.span()
        replacement = "[{}] {}".format(placeholder, match.group("noun"))
        yield doc.redact(start, end, replacement, info="eye color")


def mask_country(
    doc: SourceText, placeholder: str = "country"
) -> Generator[Redaction, None, None]:
    """Generate redactions for country names.

    E.g., "Burundi" -> "[country]"

    :param doc: Source text
    :param placeholder: String to use in lieu of country name
    :yields: Redactions
    """
    yield from _redact_entities(doc, COUNTRIES, placeholder, info="country")
    yield from _redact_words(doc, COUNTRIES, placeholder, info="country")


def mask_language(
    doc: SourceText, placeholder: str = "language"
) -> Generator[Redaction, None, None]:
    """Generate redactions for languages.

    E.g., "Spanish" -> "[language]"

    :param doc: Source text
    :param placeholder: String to use in lieu of language
    :yields: Redactions
    """
    yield from _redact_entities(doc, LANGUAGES, placeholder, info="language")
    yield from _redact_words(doc, LANGUAGES, placeholder, info="language")


def mask_nationality(
    doc: SourceText, placeholder: str = "nationality/ethnicity"
) -> Generator[Redaction, None, None]:
    """Generate redactions for nationalities.

    E.g., "Mexican" -> "[nationality/ethnicity]"

    :param doc: Source text
    :param placeholder: String to use in lieu of nationality
    :yields: Redactions
    """
    # NOTE(acw): Tried using spacy's NER classifier alone here, but it would
    # too often classify irrelevant words (e.g., "5/18/2019" or "Silver Honda")
    # as languages or locations.
    yield from _redact_entities(doc, NATIONALITIES, placeholder, info="nationality")
    yield from _redact_words(doc, NATIONALITIES, placeholder, info="nationality")


def mask_race(
    doc: SourceText, placeholder: str = "race/ethnicity"
) -> Generator[Redaction, None, None]:
    """Generate redactions for words that directly indicate race.

    E.g., "African American" -> "[race/ethnicity]"

    :param doc: Source text
    :param placeholder: String to use in lieu of race/ethnicity
    :yields: Redactions
    """
    pattern = _re_literal_adj_list(RACE_WORDS | SLURS)
    race_re = re.compile(pattern, re.IGNORECASE)
    replacement = "[{}]".format(placeholder)

    for match in race_re.finditer(doc.text):
        start, end = match.span()
        yield doc.redact(start, end, replacement, info="race")


def mask_race_correlated_feature(
    doc: SourceText, placeholder: str = "physical description"
) -> Generator[Redaction, None, None]:
    """Generate redactions for features that, even without context, are highly
    correlated with race.

    E.g., "We saw a blonde" -> "We saw a [physical description]"

    :param doc: Source text
    :param placeholder: String to use in lieu of race-correlated features
    :yields: Redactions
    """
    feature_group = re_literal_group(RACE_FEATURES)
    pattern = r"\b{}\b".format(feature_group)
    feature_re = re.compile(pattern, re.IGNORECASE)
    replacement = "[{}]".format(placeholder)

    for match in feature_re.finditer(doc.text):
        start, end = match.span()
        yield doc.redact(start, end, replacement, info="race")


def mask_race_abbrev(
    doc: SourceText, placeholder: str = "race/ethnicity"
) -> Generator[Redaction, None, None]:
    """Generate redactions for abbreviated words that directly indicate race.

    E.g., "AMA" -> "[race/ethnicity] male adult"

    :param doc: Source text
    :param placeholder: String to use in lieu of race/ethnicity
    :yields: Redactions
    """
    race_group = RACE_ABBREV
    pattern = r"(?<=\b){}s?(?=\b)".format(race_group)
    race_re = re.compile(pattern)  # don't ignore case

    sex_dict = {"F": "female", "M": "male"}
    age_dict = {"A": "adult", "J": "juvenile"}

    for match in race_re.finditer(doc.text):
        start, end = match.span()
        # Insert female/male and adult/juvenile depending on the 2nd and 3rd groups.
        replacement = "[{}] {} {}".format(
            placeholder, sex_dict.get(match.group(2)), age_dict.get(match.group(3))
        )
        yield doc.redact(start, end, replacement, info="race")
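

# The replacement above assumes RACE_ABBREV is a pattern whose 2nd and 3rd
# capture groups hold the sex and age letters (it is defined in mask_const,
# not here). Hedged sketch of the expansion from the docstring example:
#
#     # "AMA" -> "[race/ethnicity] male adult"
#     #   match.group(2) == "M" -> "male"; match.group(3) == "A" -> "adult"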


def mask_appearance_list(
    doc: SourceText, placeholder: str = "color"
) -> Generator[Redaction, None, None]:
    """Generate redactions for words in list format that directly indicate race.

    E.g., "Race: Hispanic" -> "Race: [race/ethnicity]"
    E.g., "Hair: Black" -> "Hair: [color]"

    :param doc: Source text
    :param placeholder: String to use in lieu of feature
    :yields: Redactions
    """
    color_group = _re_literal_adj_list(
        SKIN_COLORS | HAIR_COLORS | HAIR_ADJS | EYE_COLORS | GENERAL_COLORS
    )
    appearance_group = re_literal_group(APPEARANCE_LIST, name="noun")
    pattern = r"{}:\s*{}".format(appearance_group, color_group)
    appearance_list_re = re.compile(pattern, re.IGNORECASE)

    for match in appearance_list_re.finditer(doc.text):
        # Use a per-match placeholder so a "Race:" match does not leak its
        # placeholder into later "Hair:"/"Eyes:" matches.
        match_placeholder = placeholder
        if match.group("noun").lower() in ["race", "complexion"]:
            match_placeholder = "race/ethnicity"
            info = "race"
        elif match.group("noun") == "eyes":
            info = "eye color"
        elif match.group("noun") == "hair":
            info = "hair color"
        else:
            info = "appearance list"

        start, end = match.span()
        replacement = "{}: [{}]".format(match.group("noun"), match_placeholder)
        yield doc.redact(start, end, replacement, info=info)


def mask_street_address(
    doc: SourceText, placeholder: str = "location"
) -> Generator[Redaction, None, None]:
    """Generate redactions for street addresses.

    E.g., "123 Maple St." -> "[location] St."

    :param doc: Source text
    :param placeholder: Text to use in lieu of literal street address
    :yields: Redactions
    """
    endings_group = re_literal_group(USPS_STREET_ABBR)
    street_addr_re = re.compile(
        r"(?:\d{1,5} [\w\s]{1,20}) (" + endings_group + r"\.?)\W?(?=\s|$)",
        re.IGNORECASE,
    )

    # Avoid matching false street locations:
    # e.g. 30 mph, #2 lane
    bad_patterns_re = re.compile(
        r"\d{1,3}\s?mph\b|\b#?\d\s?([nesw]/?b\s?)?lane\b",  # speed | lane in road
        re.IGNORECASE,
    )

    for match in street_addr_re.finditer(doc.text):
        matched_text = match.group(0)
        if bad_patterns_re.search(matched_text):
            continue

        start, end = match.span()
        replacement = "[{}] {}".format(placeholder, match.group(1))
        yield doc.redact(start, end, replacement, info="street address")
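

# Quick usage sketch for the rule above; the bad_patterns_re guard drops
# shape-alike false positives such as "30 mph" or "#2 lane":
#
#     doc = SourceText("Met the victim at 123 Maple St. around noon.")
#     list(mask_street_address(doc))
#     # -> "123 Maple St." becomes "[location] St." (per the docstring example)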


def mask_district(
    doc: SourceText, locale: Locale, placeholder: str = "district"
) -> Generator[Redaction, None, None]:
    """Generate redactions for police precincts.

    :param doc: Source text
    :param locale: Locale to use for masking
    :param placeholder: Text to use in lieu of literal district name
    :yields: Redactions
    """
    for match in locale.match_district(doc.text):
        start, end = match.span()
        sfx = (match.group(2) or "").lower()
        # Avoid adding the suffix if it'd be awkwardly redundant, as in the case
        # of "[district] district".
        sfx = "" if sfx == placeholder else sfx
        replacement = "[{}]".format(placeholder)
        if sfx:
            replacement += " " + sfx
        yield doc.redact(start, end, replacement, info="district name")


def mask_presumed_street_name(
    doc: SourceText, placeholder: str = "street"
) -> Generator[Redaction, None, None]:
    """Generate redactions for entities that look like street names.

    E.g., "Maple St." -> "[street] St."

    :param doc: Source text
    :param placeholder: Text to use in lieu of street name
    :yields: Redactions
    """
    ending_variants = sum(
        [[abbr, abbr.capitalize(), abbr.upper()] for abbr in USPS_STREET_ABBR],
        list[str](),
    )
    street_endings = re_literal_group(ending_variants, capture=False)
    street_name_pattern = (
        r"(?:(?:\d+|[A-Z])[A-Za-z\']*\s+)+"
        + r"(%s\.?)" % street_endings
        + r"(?=[,\/#!$%\^&\*;:{}=\-_`~()\s])"
    )
    # The last pattern matches any `\b` except `\.` (matched in the second pattern).
    # This keeps the period (e.g. in "St.") in the placeholder.
    # NOTE(jnu): this is not case insensitive; the point is to use the
    # capitalization structure to infer words that might constitute a street
    # name.
    street_name_re = re.compile(street_name_pattern)

    # Avoid matching false street names:
    # e.g. EB lane, E/B lane, #2 lane (on the freeway)
    bad_patterns_re = re.compile(r"\b(#?\d\s)?([nesw]/?b\s?)?lane\b", re.IGNORECASE)

    for match in street_name_re.finditer(doc.text):
        matched_text = match.group(0)
        if bad_patterns_re.search(matched_text):
            continue

        start, end = match.span()
        replacement = "[{}] {}".format(placeholder, match.group(1))
        yield doc.redact(start, end, replacement, info="presumed street name")


def mask_known_street_name(
    doc: SourceText, locale: Locale, placeholder: str = "street"
) -> Generator[Redaction, None, None]:
    """Generate redactions for known streets in the city.

    E.g., "Arguello and Euclid" -> "[street] and [street]"

    :param doc: Source text
    :param locale: Locale to use for masking
    :param placeholder: Text to use in lieu of street name
    :yields: Redactions
    """
    for match in locale.match_street_name(doc.text):
        start, end = match.span()
        replacement = "[{placeholder}]{conj}[{placeholder}]".format(
            placeholder=placeholder, conj=match.group("conj")
        )
        yield doc.redact(start, end, replacement, info="known street name")


def mask_neighborhood(
    doc: SourceText, locale: Locale, placeholder: str = "neighborhood"
) -> Generator[Redaction, None, None]:
    """Generate redactions for neighborhoods in the city.

    E.g., "Parkside" -> "[neighborhood]"

    :param doc: Source text
    :param locale: Locale to use to perform masking
    :param placeholder: Text to use in lieu of neighborhood name
    :yields: Redactions
    """
    # TODO(jnu): improve Locale API for matching these
    yield from _redact_entities(
        doc, locale.neighborhoods, placeholder, info="neighborhood"
    )


def _create_person_name_map(persons: Iterable[AnyPerson]) -> Dict[str, Set[AnyPerson]]:
    """Create a map from surface name representations to persons.

    The map connects the surface representations of a human name (such as
    "John P. Smith") to the PersonName instances that this name could refer to.
    In most cases this mapping is unique; however, there may be ambiguous cases
    such as "J. Smith" that might refer to multiple individuals.

    :param persons: List of person references
    :returns: Map from names to person references
    """
    m = DefaultDict[str, Set[AnyPerson]](set)

    for p in persons:
        for s in p.name_rep():
            m[s].add(p)

    return dict(m)
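

# Shape of the mapping this helper produces (illustrative only; name_rep()
# is defined on PersonName/OfficerName, not in this module):
#
#     # Given persons John Smith (P1) and Jane Smith (P2), the map might be
#     # {"John Smith": {P1}, "Jane Smith": {P2}, "J. Smith": {P1, P2}, ...}
#     # i.e. short surface forms can legitimately point at several people.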


def mask_person(
    doc: SourceText,
    persons: Iterable[AnyPerson],
    info: str,
) -> Generator[Redaction, None, None]:
    """Generate a list of redactions for the persons given in the input.

    :param doc: Source text
    :param persons: List of person references to redact
    :param info: Kind of reference being redacted ("officer" or "person");
        also passed to each redaction for tracing
    :yields: Redaction instances
    """
    person_signs = _create_person_name_map(persons)

    # Process surface representations of names in order of longest to shortest.
    # This means the longest names will be replaced first, which should help to
    # avoid ambiguity.
    sorted_signs = sorted(person_signs.items(), key=lambda x: len(x[0]), reverse=True)

    for signifier, signified in sorted_signs:
        # Ambiguous references:
        pattern = re.compile(signifier, re.IGNORECASE)
        ordered_signified = sorted(signified, key=lambda a: a.get_indicator())
        if info == "officer":
            # Replacement reads "Officer #1 or Officer #2"
            codename = " or ".join([p.get_indicator() for p in ordered_signified])
        elif info == "person":
            # Replacement reads "(PERSON_1 or PERSON_2)" rather than "(PERSON_1) or (PERSON_2)"
            codename = "(%s)" % " or ".join(
                [re.sub(r"[\(\)]", "", p.get_indicator()) for p in ordered_signified]
            )

        for match in pattern.finditer(doc.text):
            replacement = codename
            start, end = match.span()
            # Special case: the rare terminal-apostrophe possessive, such as
            # "Moses'" where the correct redaction synthetically adds the 's.
            # TODO(jnu): probably better to handle this where we handle the
            # indefinite article redaction, in SourceText.
            if doc.text[end : end + 2] == "' ":
                replacement = codename + "'s"
                end += 1

            # TODO(jnu): clean up coloring and classing
            yield doc.redact(
                start,
                end,
                replacement,
                auto_capitalize=False,
                autocorrect_article=False,
                info=info,
            )


def mask_person_fuzzy(
    doc: SourceText,
    persons: Iterable[PersonName],
    info: str,
) -> Generator[Redaction, None, None]:
    """Generate a list of redactions for the persons given in the input
    by redacting proper nouns in the text that are similar to the names in
    persons.

    :param doc: Source text
    :param persons: List of person references to redact
    :param info: Kind of reference being redacted; passed to each redaction
        for tracing
    :yields: Redaction instances
    """
    min_character_limit = 5
    propn_tokens = {
        token
        for token in doc.nlp
        if token.pos_ == "PROPN" and len(token) > min_character_limit
    }

    for token in propn_tokens:
        start_char = token.idx
        end_char = start_char + len(token)

        if not doc.can_redact(start_char, end_char):
            continue

        valid_persons = [
            person
            for person in persons
            if _name_match({f"{person.first} {person.last}"}, {token.text.upper()})
            or _name_match(person.last, {token.text.upper()}, 1)
            or _name_match(person.first, {token.text.upper()}, 1)
        ]

        if valid_persons:
            replacement = "(%s)" % " or ".join(
                [
                    re.sub(r"[\(\)]", "", person.get_indicator())
                    for person in valid_persons
                ]
            )
            yield doc.redact(
                start_char,
                end_char,
                replacement,
                auto_capitalize=False,
                autocorrect_article=False,
                info=info,
            )


def mask(
    locale: Locale,
    narrative: str,
    persons: Iterable[PersonName],
    officers: Iterable[OfficerName],
) -> List[Redaction]:
    """Apply masking and formatting to narrative text.

    :param locale: Locale to use for masking
    :param narrative: Incident report text
    :param persons: List of names of people appearing in text
    :param officers: List of names of officers appearing in text
    :returns: List of redactions
    """
    doc = SourceText(narrative)

    return list(
        itertools.chain(
            mask_person(doc, officers, "officer"),
            mask_person(doc, persons, "person"),
            mask_street_address(doc),
            mask_district(doc, locale),
            mask_known_street_name(doc, locale),
            mask_presumed_street_name(doc),
            mask_neighborhood(doc, locale),
            mask_skin_color(doc),
            mask_hair_style(doc),
            mask_hair_color(doc),
            mask_eye_color(doc),
            mask_appearance_list(doc),
            mask_race_abbrev(doc),
            mask_race(doc),
            mask_race_correlated_feature(doc),
            mask_country(doc),
            mask_language(doc),
            mask_nationality(doc),
            mask_person_fuzzy(doc, persons, "person"),
        )
    )
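

# End-to-end sketch of the masking pass above (PersonName / OfficerName
# construction lives in their own modules; the arguments here are hypothetical):
#
#     redactions = mask(
#         locale,
#         "Officer Smith interviewed Jane Doe on Maple St.",
#         persons=[...],   # PersonName instances
#         officers=[...],  # OfficerName instances
#     )
#     # -> a flat List[Redaction] combining every rule chained above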


def merge_annotations(annotations: List[Redaction], narrative: str) -> List[Redaction]:
    """Merge 'person' annotations that contain the same text and info
    if they are only separated by a single whitespace character.

    E.g., "(S1) (S1)" -> "(S1)"

    :param annotations: Unsorted list of annotations
    :param narrative: Incident report text
    :returns: Reverse-sorted list of merged annotations
    """
    if not annotations or len(annotations) <= 1:
        return annotations

    # Order redactions by character number, last to first.
    annotations.sort(key=lambda x: x.start, reverse=True)

    final_annotations = list[Redaction]()
    end_annotation = annotations[0]

    for annotation in annotations[1:]:
        if (
            end_annotation.start - annotation.end <= 1
            and end_annotation.text == annotation.text
            and end_annotation.info == annotation.info
            and end_annotation.info == "person"
            and re.match(r"\s", narrative[annotation.end : end_annotation.start])
        ):
            end_annotation.start = annotation.start
        else:
            final_annotations.append(end_annotation)
            end_annotation = annotation
    final_annotations.append(end_annotation)

    return final_annotations
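

# Worked example of the merge rule above, following the docstring:
#
#     # narrative: "... (S1) (S1) fled the scene ..."
#     # Two adjacent 'person' redactions with identical text, separated by a
#     # single whitespace character, collapse into one redaction spanning both,
#     # so the output reads "(S1)" instead of "(S1) (S1)".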


def annotate(
    locale: Locale,
    narrative: str,
    persons: Iterable[dict],
    officers: Iterable[dict],
    redact_officers_from_text: bool = True,
) -> List[Redaction]:
    """Apply redaction tool and formatting to narrative text.

    :param locale: Location of the narrative
    :param narrative: Incident report text
    :param persons: List of people appearing in text
    :param officers: List of officers appearing in text
    :param redact_officers_from_text: Whether to also redact officer names
        detected directly in the narrative text
    :returns: Redaction annotations
    """
    person_types = set(locale.indicators.keys())

    persons = locale.filter_names(persons)
    formatted_persons = [PersonName(**person) for person in persons]
    formatted_officers = [OfficerName(**officer) for officer in officers]

    # get_persons_from_narrative is only applicable to SF right now; will refactor later.
    formatted_persons += get_persons_from_narrative(narrative, 0, person_types)
    if redact_officers_from_text:
        formatted_officers += get_officers_from_narrative(narrative)

    formatted_persons = PersonName.dedupe(formatted_persons, locale)
    formatted_officers = OfficerName.dedupe(formatted_officers, locale)

    # Create redactions.
    annotations = mask(
        locale, narrative, persons=formatted_persons, officers=formatted_officers
    )
    return merge_annotations(annotations, narrative)
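

# Typical entry-point usage (dict keys are whatever PersonName / OfficerName
# accept; the values shown are hypothetical):
#
#     annotations = annotate(
#         locale,
#         narrative="Officer Smith contacted Jane Doe ...",
#         persons=[{"first": "Jane", "last": "Doe"}],
#         officers=[{"last": "Smith"}],
#     )
#     # -> reverse-sorted, merged list of Redaction annotations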