Coverage for blind_charging/locale/locale.py: 98%

60 statements  

« prev     ^ index     » next       coverage.py v6.5.0, created at 2023-02-17 20:36 +0000

1"""Localization utilities for redaction.""" 

2import re 

3from typing import Dict, Iterable, List 

4 

5from ..re_util import re_literal_group 

6from .const import USPS_STREET_ABBR 

7 

# Track all instances to provide a lookup API.
# Keys are lower-cased locale names; values are the registered instances.
_REGISTRY: Dict[str, "Locale"] = {}


class Locale:
    """A collection of information specific to a city or region.

    Provides match helpers to facilitate location-aware redaction.

    Every instance is recorded in the module-level registry under its
    lower-cased name at allocation time, so it can later be retrieved
    with :meth:`Locale.get`.
    """

    # Name values (after strip + lower-casing) that are treated as "no name".
    _PLACEHOLDER_NAMES = frozenset(["", "n/a", "na", "none", "missing"])

    def __new__(cls, *args, **kwargs) -> "Locale":
        """Allocate the instance and register it under its lower-cased name.

        The name is taken from the first positional argument, or from the
        ``name`` keyword argument when no positional arguments are given.

        :raises TypeError: If no name was supplied
        """
        if args:
            name = args[0]
        elif "name" in kwargs:
            name = kwargs["name"]
        else:
            raise TypeError("Locale() missing required argument: 'name'")
        inst = super().__new__(cls)
        # A later instance with the same (case-insensitive) name replaces
        # the earlier registry entry.
        _REGISTRY[name.lower()] = inst
        return inst

    @staticmethod
    def get(name: str) -> "Locale":
        """Fetch a locale by name.

        The lookup is case-insensitive.

        :param name: Name of locale
        :returns: Locale instance
        :raises ValueError: If locale doesn't exist
        """
        locale = _REGISTRY.get(name.lower())
        if locale is None:
            raise ValueError("Locale {} not found".format(name))
        return locale

    @classmethod
    def _compile_district_re(cls, districts: Iterable[str]) -> re.Pattern:
        """Compile a case-insensitive pattern for police district mentions.

        The pattern is a district name, followed by whitespace, followed by
        one of a fixed set of suffixes ("police station", "station",
        "district", "unit").

        :param districts: Known district names
        :returns: Compiled pattern
        """
        # NOTE(review): re_literal_group is defined elsewhere; presumably it
        # builds an alternation group of escaped literals -- confirm.
        district_names_pattern = re_literal_group(districts)
        suffixes_pattern = re_literal_group(
            [
                r"police station",
                r"station",
                r"district",
                r"unit",
            ]
        )
        full_pattern = r"{}\s+{}".format(district_names_pattern, suffixes_pattern)
        return re.compile(full_pattern, re.IGNORECASE)

    @classmethod
    def _compile_street_name_re(cls, street_names: Iterable[str]) -> re.Pattern:
        """Compile a case-sensitive pattern for street intersections.

        Mask known street names that appear but might not have street
        indicators. Only in strict situations (where we see
        "Streetname & Streetname", "Streetname and Streetname" or
        "Streetname / Streetname").

        :param street_names: Known street names
        :returns: Compiled pattern with a named group ``conj`` that captures
            the conjunction between the two street names
        """
        # Each name is offered as given, capitalized, and upper-cased because
        # the final pattern is compiled without IGNORECASE.
        # NOTE(review): str.capitalize lower-cases everything after the first
        # character, so multi-word names only match in the three exact forms
        # generated here -- confirm that is intended.
        street_variants: List[str] = [
            variant
            for name in street_names
            for variant in (name, name.capitalize(), name.upper())
        ]
        street_group = re_literal_group(street_variants)
        ending_variants: List[str] = [
            variant
            for abbr in USPS_STREET_ABBR
            for variant in (abbr, abbr.capitalize(), abbr.upper())
        ]
        endings = re_literal_group(ending_variants)
        optional_endings = r"(?:\s+{})?".format(endings)
        single_street = r"{}{}".format(street_group, optional_endings)
        # Matches street names (g) separated by some "and" or "/"
        # Requires word boundaries before and after match (avoids matching "PD & FD")
        # <conj> used in `mask_known_street_name`
        # NOTE(jnu): search is case sensitive to avoid overmatching, such as "apple and cherry."
        intersection = (
            r"(?<=\b){g}(?P<conj>(?:\s+(?:and|And|AND|\&)\s+)|\s*/\s*){g}(?=\b)".format(
                g=single_street
            )
        )
        return re.compile(intersection)

    @classmethod
    def _compile_excluded_name_re(cls, excluded_names: Iterable[str]) -> re.Pattern:
        """Compile a case-insensitive alternation of excluded-name patterns.

        Entries are used as regular expressions verbatim. Empty entries are
        dropped, and when nothing remains a never-matching pattern is
        returned: an empty pattern (or an empty alternative) matches every
        string, which would make ``filter_names`` discard every person.

        :param excluded_names: Regex fragments describing names to exclude
        :returns: Compiled pattern
        """
        # Don't re.escape() names - assume important regex features are included
        fragments = [name for name in excluded_names if name]
        if not fragments:
            # "(?!)" is an empty negative lookahead; it can never match.
            return re.compile(r"(?!)", re.IGNORECASE)
        pattern = r"|".join(fragments)
        return re.compile(pattern, re.IGNORECASE)

    def __init__(
        self,
        name: str,
        police_districts: Iterable[str],
        street_names: Iterable[str],
        excluded_names: Iterable[str],
        neighborhoods: Iterable[str],
        indicators: Dict,
        indicator_position: str,
    ):
        """Build the locale and pre-compile its matching patterns.

        :param name: Name of locale
        :param police_districts: District names used for district matching
        :param street_names: Street names used for intersection matching
        :param excluded_names: Regex fragments for names to drop in
            ``filter_names``
        :param neighborhoods: Stored as-is on the instance
        :param indicators: Stored as-is on the instance
        :param indicator_position: Stored as-is on the instance
        """
        self.name = name
        self._district_re = self._compile_district_re(police_districts)
        self._street_name_re = self._compile_street_name_re(street_names)
        self._excluded_name_re = self._compile_excluded_name_re(excluded_names)
        self.neighborhoods = neighborhoods
        self.indicators = indicators
        self.indicator_position = indicator_position

    def match_district(self, text: str) -> Iterable[re.Match]:
        """Find police district names within the text."""
        return self._district_re.finditer(text)

    def match_street_name(self, text: str) -> Iterable[re.Match]:
        """Find known street names within the text."""
        return self._street_name_re.finditer(text)

    def filter_names(self, persons: Iterable[dict]) -> Iterable[dict]:
        """Trim and remove ineligible names from inputted persons list.

        Each person's ``"name"`` is stripped and lower-cased in place (the
        input dicts are modified). A person is dropped when the name is
        absent or empty, is a placeholder such as "n/a" or "none", or
        matches the excluded-name pattern from the start of the string.

        :param persons: Dicts that may carry a ``"name"`` entry
        :returns: List of the remaining person dicts, in input order
        """
        filtered_persons = []
        for person in persons:
            # remove person if name literal is missing
            raw_name = person.get("name")
            if not raw_name:
                continue

            normalized = raw_name.strip().lower()
            # Written back before any further check, so even persons that
            # end up dropped carry the normalized name.
            person["name"] = normalized

            if normalized in self._PLACEHOLDER_NAMES:
                continue

            # remove person if name is excluded
            if self._excluded_name_re.match(normalized):
                continue

            filtered_persons.append(person)

        return filtered_persons