Coverage for blind_charging/text_processing.py: 97%
88 statements
« prev ^ index » next coverage.py v6.5.0, created at 2023-02-17 20:36 +0000
« prev ^ index » next coverage.py v6.5.0, created at 2023-02-17 20:36 +0000
1"""Utilities for understanding the narrative text.
3This module does not do masking per se, but the information gleaned through
4these utilities can be used to inform masking.
5"""
6import re
7from typing import List, Optional, Set
9import unidecode
11from .locale.const import INDICATOR_POS_PREFIX, INDICATOR_POS_SUFFIX
12from .mask_const import NAME_PHRASES
13from .officer import OfficerName
14from .person import PersonName
15from .re_util import re_literal_group
16from .source_text import nlp
def _add_person_mention(
    mentions: List[PersonName],
    indicator: str,
    report_id: int,
    name: Optional[str] = None,
) -> None:
    """Append a person mention for ``indicator`` to ``mentions`` in place.

    An indicator that starts with one of the ``NAME_PHRASES`` is not kept as
    an indicator: it is recorded with an empty indicator string, and only
    when a name accompanies it. Any other indicator is always recorded,
    with or without a name.

    :param mentions: List that receives the new ``PersonName`` (mutated)
    :param indicator: Matched indicator text
    :param report_id: ID of report
    :param name: Name associated with the indicator, if any
    """
    phrase_p = re.compile(
        r"\b" + re_literal_group(NAME_PHRASES, capture=False) + r"\b,?",
        re.IGNORECASE,
    )

    # Regular indicator flag: keep it as-is, named or not.
    if phrase_p.match(indicator) is None:
        mentions.append(PersonName(indicator, report_id, name))
        return

    # Name phrase: worth recording only if it actually introduced a name.
    if name:
        mentions.append(PersonName("", report_id, name))
def preprocess(narrative: str) -> str:
    """Apply formatting to text to make it easier to mask.

    The inverted question mark is swapped for an apostrophe and the text is
    then transliterated to ASCII with ``unidecode``.

    :param narrative: Raw text of police narrative
    :returns: Cleaner text ready for processing; ``""`` for empty input
    """
    if not narrative:
        return ""
    # NOTE(jnu): any alterations made here should be fine to return either as
    # masked or unmasked text. The resulting text should be treated as the
    # "true" narrative that we want to present.

    # This replacement has to run *before* transliteration: unidecode returns
    # ASCII-only text, so the non-ASCII inverted question mark can no longer
    # be present afterwards and the replacement would never apply.
    # TODO(jnu): why is this one necessary?
    narrative = narrative.replace("¿", "'")
    return unidecode.unidecode(narrative)
def get_persons_from_narrative(
    narrative: str,
    report_id: int,
    person_types: Set[str],
) -> List[PersonName]:
    """Find the persons that the narrative flags with an indicator.

    Three kinds of indicator are searched for: parenthesized flags built
    from ``person_types`` (e.g. ``(V-1)``), leading flags (e.g. ``V1-``
    directly followed by a letter), and the phrases in ``NAME_PHRASES``.
    Each indicator is paired with the nearest ``PERSON`` entity when nothing
    but whitespace separates the two; otherwise the indicator is recorded
    on its own.

    :param narrative: Narrative text
    :param report_id: ID of report
    :param person_types: Known person types listed on this report
    :returns: List of PersonNames
    """
    doc = nlp(narrative)
    text = doc.text

    # If either form of a pair is listed, treat both forms as listed.
    for slashed, bare in (("R/V", "V"), ("R/W", "W")):
        if slashed in person_types or bare in person_types:
            person_types = person_types.union({slashed, bare})
    # Make the slash optional in every type.
    person_types = {x.replace("/", "/?") for x in person_types}
    type_group = r"(" + r"|".join(person_types) + r")"

    def by_start(match):
        return match.start()

    # Find involved-person indicator flags.
    paren_p = re.compile(
        r"(?:^|(?<=\W))\(" + type_group + r"(-|/)?\d{1,2}\)(?=\W)"
    )
    indicators = sorted(paren_p.finditer(text), key=by_start)

    front_p = re.compile(r"(?<=\b)" + type_group + r"\d{0,2}(-|/)(?=[a-zA-Z])")
    front_indicators = sorted(front_p.finditer(text), key=by_start)

    text_p = re.compile(
        r"\b" + re_literal_group(NAME_PHRASES, capture=False) + r"\b,?",
        re.IGNORECASE,
    )
    text_indicators = sorted(text_p.finditer(text), key=by_start)

    mentions: List[PersonName] = []

    for indicator_position in [INDICATOR_POS_PREFIX, INDICATOR_POS_SUFFIX]:
        is_prefix = indicator_position is INDICATOR_POS_PREFIX

        # Map the character position of each person entity to the entity:
        # the entity start when the indicator precedes the name, the entity
        # end when the indicator follows it.
        if is_prefix:
            person_pos = {e.start_char: e for e in doc.ents if e.label_ == "PERSON"}
            # The list is extended in place, so these extra indicators are
            # also visited during the suffix pass.
            indicators += front_indicators
            indicators += text_indicators
        else:
            person_pos = {e.end_char: e for e in doc.ents if e.label_ == "PERSON"}

        for indicator in indicators:
            # NOTE: The edge index is exclusive of any part of the indicator
            # when it is the end; the character there is outside the token.
            if is_prefix:
                edge = indicator.end()
                candidates = [
                    (pos - edge, ent)
                    for pos, ent in person_pos.items()
                    if pos >= edge
                ]
            else:
                edge = indicator.start()
                candidates = [
                    (edge - pos, ent)
                    for pos, ent in person_pos.items()
                    if pos <= edge
                ]

            # Stays None unless a person is clearly tied to this indicator.
            name = None
            if candidates:
                # Closest person on the relevant side of the indicator.
                gap, person = min(candidates, key=lambda pair: pair[0])
                if is_prefix:
                    tween = text[edge : edge + gap]
                else:
                    tween = text[edge - gap : edge]
                # Anything other than whitespace in between means the person
                # is assumed to be unrelated to the indicator.
                # TODO(jnu): Probably want more sophisticated logic; some
                # punctuation is probably ok.
                if not tween.strip():
                    candidate = person.text.strip()
                    # Uninformative names are dropped.
                    if "UNKNOWN" not in candidate.upper():
                        name = candidate

            _add_person_mention(mentions, indicator.group(), report_id, name)

    return mentions
def get_officers_from_narrative(narrative: str) -> List[OfficerName]:
    """Collect every officer reference found in the narrative.

    The search pattern is an alternation of four sub-patterns assembled from
    the regex fragments exposed on ``OfficerName``; each match is wrapped in
    an ``OfficerName``.

    :param narrative: Police report text
    :returns: List of officer names
    """
    dgt5 = OfficerName.dgt5_re
    title = OfficerName.t_re
    name = OfficerName.n_re
    star = OfficerName.star_re

    patterns = [
        # (1A23B) (Ofc.) John Doe #1234
        f"({dgt5})?({title})?({name}){{1,2}}({star})",
        # (1A23B) Ofc. John Doe (#1234)
        f"({dgt5})?{title}({name})+({star})?",
        # (1A23B) Ofc. (John Doe) #1234
        f"({dgt5})?{title}(({name})+)?({star})",
        # 1A23B
        dgt5,
    ]
    # Each alternative gets its own group: (a)|(b)|(c)|(d)
    officer_p = re.compile("|".join(f"({pattern})" for pattern in patterns))

    return [OfficerName(found.group()) for found in officer_p.finditer(narrative)]