Coverage for blind_charging/locale/locale.py: 98%
60 statements
« prev ^ index » next coverage.py v6.5.0, created at 2023-02-17 20:36 +0000
« prev ^ index » next coverage.py v6.5.0, created at 2023-02-17 20:36 +0000
1"""Localization utilities for redaction."""
2import re
3from typing import Dict, Iterable, List
5from ..re_util import re_literal_group
6from .const import USPS_STREET_ABBR
8# Track all instances to provide a lookup API
9_REGISTRY: Dict[str, "Locale"] = {}
class Locale:
    """A collection of information specific to a city or region.

    Provides match helpers to facilitate location-aware redaction.

    Every instance is recorded in the module-level ``_REGISTRY`` under its
    lower-cased name when it is created, so it can later be retrieved with
    :meth:`Locale.get`.
    """

    def __new__(cls, *args, **kwargs) -> "Locale":
        """Create the instance and register it under its lower-cased name.

        The name is read from the first positional argument or, when the
        caller used keywords only, from the ``name`` keyword argument.
        """
        inst = super().__new__(cls)
        name = args[0] if args else kwargs.get("name")
        # If no name was supplied at all, skip registration and let __init__
        # raise its usual TypeError about the missing argument.
        if name is not None:
            _REGISTRY[name.lower()] = inst
        return inst

    @staticmethod
    def get(name: str) -> "Locale":
        """Fetch a locale by name.

        The lookup is case-insensitive.

        :param name: Name of locale
        :returns: Locale instance
        :raises ValueError: If locale doesn't exist
        """
        locale = _REGISTRY.get(name.lower())
        if locale is None:
            raise ValueError("Locale {} not found".format(name))
        return locale

    @classmethod
    def _compile_district_re(cls, districts: Iterable[str]) -> re.Pattern:
        """Compile a case-insensitive pattern for police district mentions.

        The pattern is a district name, whitespace, then one of the suffixes
        "police station", "station", "district" or "unit".

        :param districts: District names
        :returns: Compiled pattern
        """
        # NOTE(review): re_literal_group is defined in ..re_util; presumably
        # it builds an alternation group of escaped literals - confirm.
        district_names_pattern = re_literal_group(districts)
        suffixes_pattern = re_literal_group(
            [
                r"police station",
                r"station",
                r"district",
                r"unit",
            ]
        )
        full_pattern = r"{}\s+{}".format(district_names_pattern, suffixes_pattern)
        return re.compile(full_pattern, re.IGNORECASE)

    @classmethod
    def _compile_street_name_re(cls, street_names: Iterable[str]) -> re.Pattern:
        """Compile a case-sensitive pattern for street intersections.

        Mask known street names that appear but might not have street
        indicators. Only in strict situations (where we see
        "Streetname & Streetname" or "Streetname and Streetname").

        :param street_names: Known street names
        :returns: Compiled pattern exposing the named group ``conj``
        """
        # Each name is offered as given, capitalized and upper-cased because
        # the final pattern is compiled without re.IGNORECASE.
        street_variants = [
            variant
            for name in street_names
            for variant in (name, name.capitalize(), name.upper())
        ]
        street_group = re_literal_group(street_variants)

        ending_variants: List[str] = [
            variant
            for abbr in USPS_STREET_ABBR
            for variant in (abbr, abbr.capitalize(), abbr.upper())
        ]
        endings = re_literal_group(ending_variants)
        optional_endings = r"(?:\s+{})?".format(endings)
        single_street = r"{}{}".format(street_group, optional_endings)

        # Matches street names (g) separated by some "and" or "/"
        # Requires word boundaries before and after match (avoids matching "PD & FD")
        # <conj> used in `mask_known_street_name`
        # NOTE(jnu): search is case sensitive to avoid overmatching, such as "apple and cherry."
        intersection = (
            r"(?<=\b){g}(?P<conj>(?:\s+(?:and|And|AND|\&)\s+)|\s*/\s*){g}(?=\b)".format(
                g=single_street
            )
        )
        return re.compile(intersection)

    @classmethod
    def _compile_excluded_name_re(cls, excluded_names: Iterable[str]) -> re.Pattern:
        """Compile a case-insensitive alternation of excluded-name patterns.

        Entries are treated as regular expressions, not literals. Empty
        entries are dropped, and when nothing remains a never-matching
        pattern is returned. Without this, an empty alternative would match
        every string and cause every name to be excluded.

        :param excluded_names: Regular expression fragments to exclude
        :returns: Compiled pattern
        """
        # Don't re.escape() names - assume important regex features are included
        patterns = [name for name in excluded_names if name]
        if not patterns:
            # An empty negative lookahead can never succeed.
            return re.compile(r"(?!)", re.IGNORECASE)
        return re.compile(r"|".join(patterns), re.IGNORECASE)

    def __init__(
        self,
        name: str,
        police_districts: Iterable[str],
        street_names: Iterable[str],
        excluded_names: Iterable[str],
        neighborhoods: Iterable[str],
        indicators: Dict,
        indicator_position: str,
    ):
        """Build the locale and precompile its matchers.

        :param name: Name of the locale
        :param police_districts: District names compiled into the district
            matcher
        :param street_names: Street names compiled into the intersection
            matcher
        :param excluded_names: Regular expression fragments used by
            :meth:`filter_names`
        :param neighborhoods: Stored unchanged on the instance
        :param indicators: Stored unchanged on the instance
        :param indicator_position: Stored unchanged on the instance
        """
        self.name = name
        self._district_re = self._compile_district_re(police_districts)
        self._street_name_re = self._compile_street_name_re(street_names)
        self._excluded_name_re = self._compile_excluded_name_re(excluded_names)
        self.neighborhoods = neighborhoods
        self.indicators = indicators
        self.indicator_position = indicator_position

    def match_district(self, text: str) -> Iterable[re.Match]:
        """Find police district names within the text."""
        return self._district_re.finditer(text)

    def match_street_name(self, text: str) -> Iterable[re.Match]:
        """Find known street names within the text."""
        return self._street_name_re.finditer(text)

    def filter_names(self, persons: Iterable[dict]) -> Iterable[dict]:
        """Trim and remove ineligible names from inputted persons list.

        Each person dict with a non-empty name has that name stripped and
        lower-cased in place. A person is dropped when the name is absent or
        empty, is one of the placeholders below, or matches the excluded-name
        pattern at its start.

        :param persons: Dicts that may carry a ``"name"`` entry
        :returns: List of the retained (and normalized) person dicts
        """
        placeholders = {"", "n/a", "na", "none", "missing"}

        filtered_persons = []
        for person in persons:
            # remove person if name literal is missing
            name = person.get("name")
            if not name:
                continue

            # Normalize in place before any further check, so the stored
            # name is updated even for persons that end up dropped.
            name = name.strip().lower()
            person["name"] = name

            if name in placeholders:
                continue

            # remove person if name is excluded
            if self._excluded_name_re.match(name):
                continue

            filtered_persons.append(person)

        return filtered_persons