Coverage for blind_charging/text_processing.py: 97%

88 statements  

coverage.py v6.5.0, created at 2023-02-17 20:36 +0000

1"""Utilities for understanding the narrative text. 

2 

3This module does not do masking per se, but the information gleaned through 

4these utilities can be used to inform masking. 

5""" 

6import re 

7from typing import List, Optional, Set 

8 

9import unidecode 

10 

11from .locale.const import INDICATOR_POS_PREFIX, INDICATOR_POS_SUFFIX 

12from .mask_const import NAME_PHRASES 

13from .officer import OfficerName 

14from .person import PersonName 

15from .re_util import re_literal_group 

16from .source_text import nlp 

17 

18 

def _add_person_mention(
    mentions: List[PersonName],
    indicator: str,
    report_id: int,
    name: Optional[str] = None,
) -> None:
    """Append a PersonName mention for the given indicator.

    If the indicator is merely a textual phrase (one of NAME_PHRASES), the
    phrase itself is not recorded: the mention is added only when a name is
    available, and with an empty indicator.

    :param mentions: List of mentions to update in place
    :param indicator: Matched indicator text
    :param report_id: ID of report
    :param name: Name associated with the indicator, if any
    """
    text_indicator_regex = re_literal_group(NAME_PHRASES, capture=False)
    text_indicator_regex = r"\b" + text_indicator_regex + r"\b,?"
    text_p = re.compile(text_indicator_regex, re.IGNORECASE)

    if not text_p.match(indicator):
        mentions.append(PersonName(indicator, report_id, name))
    elif text_p.match(indicator) and name:
        mentions.append(PersonName("", report_id, name))
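
# Illustrative sketch (not part of the original module): assuming NAME_PHRASES
# contains a phrase like "known as", _add_person_mention would behave roughly
# as follows:
#
#     mentions: List[PersonName] = []
#     _add_person_mention(mentions, "(V1)", 42, "JANE ROE")       # keeps indicator and name
#     _add_person_mention(mentions, "known as", 42, "JOHN DOE")   # keeps only the name
#     _add_person_mention(mentions, "known as", 42)               # adds nothing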

def preprocess(narrative: str) -> str:
    """Apply formatting to text to make it easier to mask.

    :param narrative: Raw text of police narrative
    :returns: Cleaner text ready for processing
    """
    if not narrative:
        return ""
    # NOTE(jnu): any alterations made here should be fine to return either as
    # masked or unmasked text. The resulting text should be treated as the
    # "true" narrative that we want to present.
    narrative = unidecode.unidecode(narrative)
    # TODO(jnu): why is this one necessary?
    narrative = narrative.replace("¿", "'")
    return narrative
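
# Illustrative sketch (not part of the original module): unidecode flattens
# accented characters to ASCII, so for example:
#
#     preprocess("Señor Pérez fled the café")  # -> "Senor Perez fled the cafe"
#     preprocess("")                           # -> ""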

def get_persons_from_narrative(
    narrative: str,
    report_id: int,
    person_types: Set[str],
) -> List[PersonName]:
    """Infer Persons mentioned in the narrative.

    Only persons flagged with indicators such as R/V are returned.

    :param narrative: Narrative text
    :param report_id: ID of report
    :param person_types: Known person types listed on this report
    :returns: List of PersonNames
    """
    doc = nlp(narrative)

    if "R/V" in person_types or "V" in person_types:
        person_types = person_types.union({"R/V", "V"})
    if "R/W" in person_types or "W" in person_types:
        person_types = person_types.union({"R/W", "W"})
    person_types = {x.replace("/", "/?") for x in person_types}
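    # Making the "/" optional in each type above (e.g. "R/?V") lets the pattern
    # match whether the narrative writes the type as "R/V" or "RV".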

    indicator_regex = r"(" + r"|".join(person_types) + r")"
    indicator_regex = r"(?:^|(?<=\W))\(" + indicator_regex + r"(-|/)?\d{1,2}\)(?=\W)"
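    # The pattern above matches parenthesized, numbered flags bounded by
    # non-word characters, e.g. "(V1)", "(V-2)", or "(R/V-12)".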

    front_indicator_regex = r"(" + r"|".join(person_types) + r")"
    front_indicator_regex = (
        r"(?<=\b)" + front_indicator_regex + r"\d{0,2}(-|/)(?=[a-zA-Z])"
    )
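    # The pattern above matches flags prefixed directly onto a name, e.g. the
    # "V1-" in "V1-DOE" or the "W/" in "W/SMITH".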

    text_indicator_regex = re_literal_group(NAME_PHRASES, capture=False)
    text_indicator_regex = r"\b" + text_indicator_regex + r"\b,?"

    # Find involved-person indicator flags
    p = re.compile(indicator_regex)
    indicators = sorted(p.finditer(doc.text), key=lambda x: x.start())

    front_p = re.compile(front_indicator_regex)
    front_indicators = sorted(front_p.finditer(doc.text), key=lambda x: x.start())

    text_p = re.compile(text_indicator_regex, re.IGNORECASE)
    text_indicators = sorted(text_p.finditer(doc.text), key=lambda x: x.start())

    mentions = list[PersonName]()

    for indicator_position in [INDICATOR_POS_PREFIX, INDICATOR_POS_SUFFIX]:
        # Build a map from indicator position to the matched indicator for all
        # the person entities.
        if indicator_position is INDICATOR_POS_PREFIX:
            person_pos = {e.start_char: e for e in doc.ents if e.label_ == "PERSON"}
        elif indicator_position is INDICATOR_POS_SUFFIX:
            person_pos = {e.end_char: e for e in doc.ents if e.label_ == "PERSON"}
        if indicator_position is INDICATOR_POS_PREFIX:
            indicators += front_indicators
            indicators += text_indicators

        for indicator in indicators:
            # NOTE: The end index is exclusive of any part of the indicator. That
            # is, the character at that position is outside the indicator token.
            if indicator_position is INDICATOR_POS_PREFIX:
                end_idx = indicator.end()
            elif indicator_position is INDICATOR_POS_SUFFIX:
                start_idx = indicator.start()

            # Find person references on the relevant side of the indicator
            # (after a prefix indicator, before a suffix indicator) and compute
            # their distance.
            if indicator_position is INDICATOR_POS_PREFIX:
                diffs = [
                    (pos - end_idx, ent)
                    for pos, ent in person_pos.items()
                    if pos >= end_idx
                ]
            elif indicator_position is INDICATOR_POS_SUFFIX:
                diffs = [
                    (start_idx - pos, ent)
                    for pos, ent in person_pos.items()
                    if pos <= start_idx
                ]

            # If no person appears on that side, add just the indicator.
            if not diffs:
                _add_person_mention(mentions, indicator.group(), report_id)
                continue

            # Take the lowest non-negative position difference
            offset, next_person = sorted(diffs, key=lambda pair: pair[0])[0]
            # Examine the substring between the indicator and the person. Reject
            # this person if it's not clearly associated with the indicator.
            if indicator_position is INDICATOR_POS_PREFIX:
                tween = doc.text[end_idx : end_idx + offset]
            elif indicator_position is INDICATOR_POS_SUFFIX:
                tween = doc.text[start_idx - offset : start_idx]

            # If there's more than just spaces between the tokens, assume the
            # person is not associated with the indicator, and just add the
            # indicator.
            # TODO(jnu): Probably want more sophisticated logic; some punctuation
            # is probably ok.
            if tween.strip():
                _add_person_mention(mentions, indicator.group(), report_id)
                continue

            # Check if the name is informative. If not, just add the indicator.
            name = next_person.text.strip()
            if "UNKNOWN" in name.upper():
                _add_person_mention(mentions, indicator.group(), report_id)
                continue

            # If we get here, conclude that the name pertains to the indicator.
            _add_person_mention(mentions, indicator.group(), report_id, name)

    return mentions
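
# Illustrative sketch (not part of the original module): the narrative below is
# hypothetical, and the outcome assumes the spaCy pipeline exposed as `nlp`
# tags "JANE ROE" as a PERSON entity:
#
#     text = preprocess("Officers contacted (V1) JANE ROE at the scene.")
#     persons = get_persons_from_narrative(text, report_id=1, person_types={"V"})
#     # expected: `persons` includes a PersonName tying the "(V1)" flag to "JANE ROE"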

def get_officers_from_narrative(narrative: str) -> List[OfficerName]:
    """Extract officer names from the narrative text.

    :param narrative: Police report text
    :returns: List of officer names
    """
    dgt5_re = OfficerName.dgt5_re
    t_re = OfficerName.t_re
    n_re = OfficerName.n_re
    star_re = OfficerName.star_re
    officer_regexes = [
        # (1A23B) (Ofc.) John Doe #1234
        r"(" + dgt5_re + ")?(" + t_re + ")?(" + n_re + "){1,2}(" + star_re + ")",
        # (1A23B) Ofc. John Doe (#1234)
        r"(" + dgt5_re + ")?" + t_re + "(" + n_re + ")+(" + star_re + ")?",
        # (1A23B) Ofc. (John Doe) #1234
        r"(" + dgt5_re + ")?" + t_re + "((" + n_re + ")+)?(" + star_re + ")",
        # 1A23B
        dgt5_re,
    ]
    p = re.compile("(" + ")|(".join(officer_regexes) + ")")
    mentions = []
    for m in p.finditer(narrative):
        mentions.append(OfficerName(m.group()))

    return mentions
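
# Illustrative sketch (not part of the original module): a minimal, hypothetical
# driver chaining the helpers together. The narrative string is invented, and
# what actually gets extracted depends on OfficerName's regexes and the spaCy
# model behind `nlp`.
if __name__ == "__main__":
    sample = preprocess("Ofc. John Doe #1234 interviewed (V1) JANE ROE on scene.")
    print(get_officers_from_narrative(sample))
    print(get_persons_from_narrative(sample, report_id=1, person_types={"V"}))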