Coverage for blind_charging/text_processing.py: 97%

88 statements  

coverage.py v6.5.0, created at 2023-02-17 20:36 +0000

1"""Utilities for understanding the narrative text. 

2 

3This module does not do masking per se, but the information gleaned through 

4these utilities can be used to inform masking. 

5""" 

6import re 

7from typing import List, Optional, Set 

8 

9import unidecode 

10 

11from .locale.const import INDICATOR_POS_PREFIX, INDICATOR_POS_SUFFIX 

12from .mask_const import NAME_PHRASES 

13from .officer import OfficerName 

14from .person import PersonName 

15from .re_util import re_literal_group 

16from .source_text import nlp 

17 

18 

def _add_person_mention(
    mentions: List[PersonName],
    indicator: str,
    report_id: int,
    name: Optional[str] = None,
) -> None:
    """Append a PersonName mention for the given indicator.

    If the indicator is merely a textual phrase (one of NAME_PHRASES), the
    phrase itself is not recorded: the mention is added only when a name is
    available, and with an empty indicator.

    :param mentions: List of mentions to update in place
    :param indicator: Matched indicator text
    :param report_id: ID of report
    :param name: Name associated with the indicator, if any
    """
    text_indicator_regex = re_literal_group(NAME_PHRASES, capture=False)
    text_indicator_regex = r"\b" + text_indicator_regex + r"\b,?"
    text_p = re.compile(text_indicator_regex, re.IGNORECASE)

    if not text_p.match(indicator):
        mentions.append(PersonName(indicator, report_id, name))
    elif text_p.match(indicator) and name:
        mentions.append(PersonName("", report_id, name))
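
# Illustrative sketch (not part of the original module): assuming NAME_PHRASES
# contains a phrase like "known as", _add_person_mention would behave roughly
# as follows:
#
#     mentions: List[PersonName] = []
#     _add_person_mention(mentions, "(V1)", 42, "JANE ROE")       # keeps indicator and name
#     _add_person_mention(mentions, "known as", 42, "JOHN DOE")   # keeps only the name
#     _add_person_mention(mentions, "known as", 42)               # adds nothing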

def preprocess(narrative: str) -> str:
    """Apply formatting to text to make it easier to mask.

    :param narrative: Raw text of police narrative
    :returns: Cleaner text ready for processing
    """
    if not narrative:
        return ""
    # NOTE(jnu): any alterations made here should be fine to return either as
    # masked or unmasked text. The resulting text should be treated as the
    # "true" narrative that we want to present.
    narrative = unidecode.unidecode(narrative)
    # TODO(jnu): why is this one necessary?
    narrative = narrative.replace("¿", "'")
    return narrative
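
# Illustrative sketch (not part of the original module): unidecode flattens
# accented characters to ASCII, so for example:
#
#     preprocess("Señor Pérez fled the café")  # -> "Senor Perez fled the cafe"
#     preprocess("")                           # -> ""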

def get_persons_from_narrative(
    narrative: str,
    report_id: int,
    person_types: Set[str],
) -> List[PersonName]:
    """Infer Persons mentioned in the narrative.

    Only persons flagged with indicators such as R/V are returned.

    :param narrative: Narrative text
    :param report_id: ID of report
    :param person_types: Known person types listed on this report
    :returns: List of PersonNames
    """
    doc = nlp(narrative)

    if "R/V" in person_types or "V" in person_types:
        person_types = person_types.union({"R/V", "V"})
    if "R/W" in person_types or "W" in person_types:
        person_types = person_types.union({"R/W", "W"})
    person_types = {x.replace("/", "/?") for x in person_types}
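    # Making the "/" optional in each type above (e.g. "R/?V") lets the pattern
    # match whether the narrative writes the type as "R/V" or "RV".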

    indicator_regex = r"(" + r"|".join(person_types) + r")"
    indicator_regex = r"(?:^|(?<=\W))\(" + indicator_regex + r"(-|/)?\d{1,2}\)(?=\W)"
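    # The pattern above matches parenthesized, numbered flags bounded by
    # non-word characters, e.g. "(V1)", "(V-2)", or "(R/V-12)".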

    front_indicator_regex = r"(" + r"|".join(person_types) + r")"
    front_indicator_regex = (
        r"(?<=\b)" + front_indicator_regex + r"\d{0,2}(-|/)(?=[a-zA-Z])"
    )
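    # The pattern above matches flags prefixed directly onto a name, e.g. the
    # "V1-" in "V1-DOE" or the "W/" in "W/SMITH".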

    text_indicator_regex = re_literal_group(NAME_PHRASES, capture=False)
    text_indicator_regex = r"\b" + text_indicator_regex + r"\b,?"

    # Find involved-person indicator flags
    p = re.compile(indicator_regex)
    indicators = sorted(p.finditer(doc.text), key=lambda x: x.start())

    front_p = re.compile(front_indicator_regex)
    front_indicators = sorted(front_p.finditer(doc.text), key=lambda x: x.start())

    text_p = re.compile(text_indicator_regex, re.IGNORECASE)
    text_indicators = sorted(text_p.finditer(doc.text), key=lambda x: x.start())

    mentions = list[PersonName]()

    for indicator_position in [INDICATOR_POS_PREFIX, INDICATOR_POS_SUFFIX]:
        # Build a map from indicator position to the matched indicator for all
        # the person entities.
        if indicator_position is INDICATOR_POS_PREFIX:
            person_pos = {e.start_char: e for e in doc.ents if e.label_ == "PERSON"}
        elif indicator_position is INDICATOR_POS_SUFFIX:
            person_pos = {e.end_char: e for e in doc.ents if e.label_ == "PERSON"}
        if indicator_position is INDICATOR_POS_PREFIX:
            indicators += front_indicators
            indicators += text_indicators

        for indicator in indicators:
            # NOTE: The end index is exclusive of any part of the indicator. That
            # is, the character at that position is outside the indicator token.
            if indicator_position is INDICATOR_POS_PREFIX:
                end_idx = indicator.end()
            elif indicator_position is INDICATOR_POS_SUFFIX:
                start_idx = indicator.start()

            # Find person references on the relevant side of the indicator
            # (after a prefix indicator, before a suffix indicator) and compute
            # their distance.
            if indicator_position is INDICATOR_POS_PREFIX:
                diffs = [
                    (pos - end_idx, ent)
                    for pos, ent in person_pos.items()
                    if pos >= end_idx
                ]
            elif indicator_position is INDICATOR_POS_SUFFIX:
                diffs = [
                    (start_idx - pos, ent)
                    for pos, ent in person_pos.items()
                    if pos <= start_idx
                ]

            # If no person appears on that side, add just the indicator.
            if not diffs:
                _add_person_mention(mentions, indicator.group(), report_id)
                continue

            # Take the lowest non-negative position difference
            offset, next_person = sorted(diffs, key=lambda pair: pair[0])[0]
            # Examine the substring between the indicator and the person. Reject
            # this person if it's not clearly associated with the indicator.
            if indicator_position is INDICATOR_POS_PREFIX:
                tween = doc.text[end_idx : end_idx + offset]
            elif indicator_position is INDICATOR_POS_SUFFIX:
                tween = doc.text[start_idx - offset : start_idx]

            # If there's more than just spaces between the tokens, assume the
            # person is not associated with the indicator, and just add the
            # indicator.
            # TODO(jnu): Probably want more sophisticated logic; some punctuation
            # is probably ok.
            if tween.strip():
                _add_person_mention(mentions, indicator.group(), report_id)
                continue

            # Check if the name is informative. If not, just add the indicator.
            name = next_person.text.strip()
            if "UNKNOWN" in name.upper():
                _add_person_mention(mentions, indicator.group(), report_id)
                continue

            # If we get here, conclude that the name pertains to the indicator.
            _add_person_mention(mentions, indicator.group(), report_id, name)

    return mentions
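
# Illustrative sketch (not part of the original module): the narrative below is
# hypothetical, and the outcome assumes the spaCy pipeline exposed as `nlp`
# tags "JANE ROE" as a PERSON entity:
#
#     text = preprocess("Officers contacted (V1) JANE ROE at the scene.")
#     persons = get_persons_from_narrative(text, report_id=1, person_types={"V"})
#     # expected: `persons` includes a PersonName tying the "(V1)" flag to "JANE ROE"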

def get_officers_from_narrative(narrative: str) -> List[OfficerName]:
    """Extract officer names from the narrative text.

    :param narrative: Police report text
    :returns: List of officer names
    """
    dgt5_re = OfficerName.dgt5_re
    t_re = OfficerName.t_re
    n_re = OfficerName.n_re
    star_re = OfficerName.star_re
    officer_regexes = [
        # (1A23B) (Ofc.) John Doe #1234
        r"(" + dgt5_re + ")?(" + t_re + ")?(" + n_re + "){1,2}(" + star_re + ")",
        # (1A23B) Ofc. John Doe (#1234)
        r"(" + dgt5_re + ")?" + t_re + "(" + n_re + ")+(" + star_re + ")?",
        # (1A23B) Ofc. (John Doe) #1234
        r"(" + dgt5_re + ")?" + t_re + "((" + n_re + ")+)?(" + star_re + ")",
        # 1A23B
        dgt5_re,
    ]
    p = re.compile("(" + ")|(".join(officer_regexes) + ")")
    mentions = []
    for m in p.finditer(narrative):
        mentions.append(OfficerName(m.group()))

    return mentions
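
# Illustrative sketch (not part of the original module): a minimal, hypothetical
# driver chaining the helpers together. The narrative string is invented, and
# what actually gets extracted depends on OfficerName's regexes and the spaCy
# model behind `nlp`.
if __name__ == "__main__":
    sample = preprocess("Ofc. John Doe #1234 interviewed (V1) JANE ROE on scene.")
    print(get_officers_from_narrative(sample))
    print(get_persons_from_narrative(sample, report_id=1, person_types={"V"}))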