Coverage for blind_charging/mask_const.py: 100%
26 statements
« prev ^ index » next coverage.py v6.5.0, created at 2023-02-17 20:36 +0000
import re
# Country and territory names.
# Base list from https://gist.github.com/marijn/274449, kept in the gist's own
# title-cased spelling (e.g. "Cote D'ivoire", "Kazakstan", "Virgin Islands, U.s.").
COUNTRIES = [
    "Afghanistan",
    "Albania",
    "Algeria",
    "American Samoa",
    "Andorra",
    "Angola",
    "Anguilla",
    "Antarctica",
    "Antigua And Barbuda",
    "Argentina",
    "Armenia",
    "Aruba",
    "Australia",
    "Austria",
    "Azerbaijan",
    "Bahamas",
    "Bahrain",
    "Bangladesh",
    "Barbados",
    "Belarus",
    "Belgium",
    "Belize",
    "Benin",
    "Bermuda",
    "Bhutan",
    "Bolivia",
    "Bosnia And Herzegovina",
    "Botswana",
    "Bouvet Island",
    "Brazil",
    "British Indian Ocean Territory",
    "Brunei Darussalam",
    "Bulgaria",
    "Burkina Faso",
    "Burundi",
    "Cambodia",
    "Cameroon",
    "Canada",
    "Cape Verde",
    "Cayman Islands",
    "Central African Republic",
    "Chad",
    "Chile",
    "China, People's Republic Of",
    "Christmas Island",
    "Cocos (Keeling) Islands",
    "Colombia",
    "Comoros",
    "Congo",
    "Congo, The Democratic Republic Of The",
    "Cook Islands",
    "Costa Rica",
    "Cote D'ivoire",
    "Croatia",
    "Cuba",
    "Cyprus",
    "Czech Republic",
    "Denmark",
    "Djibouti",
    "Dominica",
    "Dominican Republic",
    "East Timor",
    "Ecuador",
    "Egypt",
    "El Salvador",
    "Equatorial Guinea",
    "Eritrea",
    "Estonia",
    "Ethiopia",
    "Falkland Islands (Malvinas)",
    "Faroe Islands",
    "Fiji",
    "Finland",
    "France",
    "French Guiana",
    "French Polynesia",
    "French Southern Territories",
    "Gabon",
    "Gambia",
    "Georgia",
    "Germany",
    "Ghana",
    "Gibraltar",
    "Greece",
    "Greenland",
    "Grenada",
    "Guadeloupe",
    "Guam",
    "Guatemala",
    "Guinea",
    "Guinea-bissau",
    "Guyana",
    "Haiti",
    "Heard Island And Mcdonald Islands",
    "Holy See (Vatican City State)",
    "Honduras",
    "Hong Kong",
    "Hungary",
    "Iceland",
    "India",
    "Indonesia",
    "Iran, Islamic Republic Of",
    "Iraq",
    "Ireland",
    "Israel",
    "Italy",
    "Jamaica",
    "Japan",
    "Jordan",
    "Kazakstan",
    "Kenya",
    "Kiribati",
    "Korea, Democratic People's Republic Of",
    "Korea, Republic Of",
    "Kosovo",
    "Kuwait",
    "Kyrgyzstan",
    "Lao People's Democratic Republic",
    "Latvia",
    "Lebanon",
    "Lesotho",
    "Liberia",
    "Libyan Arab Jamahiriya",
    "Liechtenstein",
    "Lithuania",
    "Luxembourg",
    "Macau",
    "Macedonia, The Former Yugoslav Republic Of",
    "Madagascar",
    "Malawi",
    "Malaysia",
    "Maldives",
    "Mali",
    "Malta",
    "Marshall Islands",
    "Martinique",
    "Mauritania",
    "Mauritius",
    "Mayotte",
    "Mexico",
    "Micronesia, Federated States Of",
    "Moldova, Republic Of",
    "Monaco",
    "Mongolia",
    "Montserrat",
    "Montenegro",
    "Morocco",
    "Mozambique",
    "Myanmar",
    "Namibia",
    "Nauru",
    "Nepal",
    "Netherlands",
    "Netherlands Antilles",
    "New Caledonia",
    "New Zealand",
    "Nicaragua",
    "Niger",
    "Nigeria",
    "Niue",
    "Norfolk Island",
    "Northern Mariana Islands",
    "Norway",
    "Oman",
    "Pakistan",
    "Palau",
    "Palestinian Territory, Occupied",
    "Panama",
    "Papua New Guinea",
    "Paraguay",
    "Peru",
    "Philippines",
    "Pitcairn",
    "Poland",
    "Portugal",
    "Puerto Rico",
    "Qatar",
    "Reunion",
    "Romania",
    "Russian Federation",
    "Rwanda",
    "Saint Helena",
    "Saint Kitts And Nevis",
    "Saint Lucia",
    "Saint Pierre And Miquelon",
    "Saint Vincent And The Grenadines",
    "Samoa",
    "San Marino",
    "Sao Tome And Principe",
    "Saudi Arabia",
    "Senegal",
    "Serbia",
    "Seychelles",
    "Sierra Leone",
    "Singapore",
    "Slovakia",
    "Slovenia",
    "Solomon Islands",
    "Somalia",
    "South Africa",
    "South Georgia And The South Sandwich Islands",
    "Spain",
    "Sri Lanka",
    "Sudan",
    "Suriname",
    "Svalbard And Jan Mayen",
    "Swaziland",
    "Sweden",
    "Switzerland",
    "Syrian Arab Republic",
    "Taiwan, Province Of China",
    "Tajikistan",
    "Tanzania, United Republic Of",
    "Thailand",
    "Togo",
    "Tokelau",
    "Tonga",
    "Trinidad And Tobago",
    "Tunisia",
    "Turkey",
    "Turkmenistan",
    "Turks And Caicos Islands",
    "Tuvalu",
    "Uganda",
    "Ukraine",
    "United Arab Emirates",
    "United Kingdom",
    "United States",
    "United States Minor Outlying Islands",
    "Uruguay",
    "Uzbekistan",
    "Vanuatu",
    "Venezuela",
    "Viet Nam",
    "Virgin Islands, British",
    "Virgin Islands, U.s.",
    "Wallis And Futuna",
    "Western Sahara",
    "Yemen",
    "Zambia",
    "Zimbabwe",
]

# Additions
# - countries that are often used in truncated or abbreviated form
#   ("Congo" and "Guinea" repeat entries of the base list; kept as-is)
COUNTRIES.extend(
    [
        "Bosnia",
        "China",
        "Congo",
        "Guinea",
        "Vatican",
        "Iran",
        # Standard English spelling; the base list only has "Kazakstan".
        "Kazakhstan",
        "Korea",
        "North Korea",
        "South Korea",
        "Macedonia",
        "Micronesia",
        "Moldova",
        "Palestine",
        "Palestinian Territory",
        "Russia",
        "South Georgia",
        "Syria",
        "Taiwan",
        "Tanzania",
        "Vietnam",
        "Virgin Islands",
    ]
)
# - rearranged strings of countries with extended names (removing commas)
COUNTRIES.extend(
    [
        "People's Republic of China",
        "Democratic Republic of the Congo",
        "Islamic Republic of Iran",
        "Democratic People's Republic of Korea",
        "Republic of Korea",
        "Former Yugoslav Republic of Macedonia",
        "Federated States of Micronesia",
        "Republic of Moldova",
        "Occupied Palestinian Territory",
        "United Republic of Tanzania",
        "British Virgin Islands",
        "U.S. Virgin Islands",
    ]
)
# Nationality / demonym adjectives.
# Base list from https://gist.github.com/marijn/274449
NATIONALITIES = [
    "Afghan",
    "Albanian",
    "Algerian",
    "American",
    "Andorran",
    "Angolan",
    "Antiguans",
    "Argentinean",
    "Armenian",
    "Australian",
    "Austrian",
    "Azerbaijani",
    "Bahamian",
    "Bahraini",
    "Bangladeshi",
    "Barbadian",
    "Barbudans",
    "Batswana",
    "Belarusian",
    "Belgian",
    "Belizean",
    "Beninese",
    "Bhutanese",
    "Bolivian",
    "Bosnian",
    "Brazilian",
    "British",
    "Bruneian",
    "Bulgarian",
    "Burkinabe",
    "Burmese",
    "Burundian",
    "Cambodian",
    "Cameroonian",
    "Canadian",
    "Cape Verdean",
    "Central African",
    "Chadian",
    "Chilean",
    "Chinese",
    "Colombian",
    "Comoran",
    "Congolese",
    "Costa Rican",
    "Croatian",
    "Cuban",
    "Cypriot",
    "Czech",
    "Danish",
    "Djibouti",
    "Dominican",
    "Dutch",
    "East Timorese",
    "Ecuadorean",
    "Egyptian",
    "Emirian",
    "Equatorial Guinean",
    "Eritrean",
    "Estonian",
    "Ethiopian",
    "Fijian",
    "Filipino",
    "Finnish",
    "French",
    "Gabonese",
    "Gambian",
    "Georgian",
    "German",
    "Ghanaian",
    "Greek",
    "Grenadian",
    "Guatemalan",
    "Guinea-Bissauan",
    "Guinean",
    "Guyanese",
    "Haitian",
    "Herzegovinian",
    "Honduran",
    "Hungarian",
    "I-Kiribati",
    "Icelander",
    "Indian",
    "Indonesian",
    "Iranian",
    "Iraqi",
    "Irish",
    "Israeli",
    "Italian",
    "Ivorian",
    "Jamaican",
    "Japanese",
    "Jordanian",
    "Kazakhstani",
    "Kenyan",
    "Kittian and Nevisian",
    "Kuwaiti",
    "Kyrgyz",
    "Laotian",
    "Latvian",
    "Lebanese",
    "Liberian",
    "Libyan",
    "Liechtensteiner",
    "Lithuanian",
    "Luxembourger",
    "Macedonian",
    "Malagasy",
    "Malawian",
    "Malaysian",
    "Maldivian",
    "Malian",
    "Maltese",
    "Marshallese",
    "Mauritanian",
    "Mauritian",
    "Mexican",
    "Micronesian",
    "Moldovan",
    "Monacan",
    "Mongolian",
    "Moroccan",
    "Mosotho",
    "Motswana",
    "Mozambican",
    "Namibian",
    "Nauruan",
    "Nepalese",
    "New Zealander",
    "Ni-Vanuatu",
    "Nicaraguan",
    "Nigerian",
    "Nigerien",
    "North Korean",
    "Northern Irish",
    "Norwegian",
    "Omani",
    "Pakistani",
    "Palauan",
    "Panamanian",
    "Papua New Guinean",
    "Paraguayan",
    "Peruvian",
    "Polish",
    "Portuguese",
    "Qatari",
    "Romanian",
    "Russian",
    "Rwandan",
    "Saint Lucian",
    "Salvadoran",
    "Samoan",
    "San Marinese",
    "Sao Tomean",
    "Saudi",
    "Scottish",
    "Senegalese",
    "Serbian",
    "Seychellois",
    "Sierra Leonean",
    "Singaporean",
    "Slovakian",
    "Slovenian",
    "Solomon Islander",
    "Somali",
    "South African",
    "South Korean",
    "Spanish",
    "Sri Lankan",
    "Sudanese",
    "Surinamer",
    "Swazi",
    "Swedish",
    "Swiss",
    "Syrian",
    "Taiwanese",
    "Tajik",
    "Tanzanian",
    "Thai",
    "Togolese",
    "Tongan",
    "Trinidadian or Tobagonian",
    "Tunisian",
    "Turkish",
    "Tuvaluan",
    "Ugandan",
    "Ukrainian",
    "Uruguayan",
    "Uzbekistani",
    "Venezuelan",
    "Vietnamese",
    "Welsh",
    "Yemenite",
    "Zambian",
    # Was the truncated literal "Zimbabwea", which cannot match the real word.
    "Zimbabwean",
]

# Random additions that don't appear in the above list
NATIONALITIES.extend(
    ["English", "Jewish", "Korean", "Kurdish", "Persian", "Puerto Rican", "Philippine"]
)
# Language names.
# Base list from
# https://en.wikipedia.org/wiki/List_of_languages_by_number_of_native_speakers
# ("Malay" appears twice in the base list; kept as-is.)
LANGUAGES = [
    "Chinese",
    "Mandarin",
    "Spanish",
    "English",
    "Hindi",
    "Arabic",
    "Bengali",
    "Portuguese",
    "Russian",
    "Japanese",
    "Lahnda",
    "Western Punjabi",
    "Marathi",
    "Telugu",
    "Wu",
    "Malay",
    "Turkish",
    "Korean",
    "French",
    "German",
    "Vietnamese",
    "Tamil",
    "Yue",
    "Urdu",
    "Javanese",
    "Italian",
    "Egyptian",
    "Persian",
    "Gujarati",
    "Iranian Persian",
    "Bhojpuri",
    "Min Nan",
    "Hakka",
    "Jinyu",
    "Hausa",
    "Kannada",
    "Indonesian",
    "Polish",
    "Pushto",
    "Yoruba",
    "Xiang Chinese",
    "Malayalam",
    "Oriya",
    "Odia",
    "Maithili",
    "Burmese",
    "Eastern Punjabi",
    "Sudanese",
    "Fulah",
    "Uzbek",
    "Algerian",
    "Moroccan",
    "Ukrainian",
    "Igbo",
    "Northern Uzbek",
    "Sindhi",
    "North Levantine",
    "Romanian",
    "Tagalog",
    "Dutch",
    "Azerbaijani",
    # U+02BD MODIFIER LETTER REVERSED COMMA, written as an escape so it cannot
    # be re-garbled by a wrong file encoding (it had decayed to NBSP + "Ω").
    "Sa\u02bdidi",  # Saʽidi
    "Kurdish",
    "Gan",
    "Amharic",
    "Northern Pashto",
    "Magahi",
    "Thai",
    "Marwari",
    "Saraiki",
    "Malagasy",
    "Oromo",
    "Serbo-Croatian",
    "Khmer",
    "Chhattisgarhi",
    "Somali",
    "Malay",
    "Cebuano",
    "Nepali",
    "Mesopotamian",
    "Assamese",
    "Sinhala",
    "Zhuang",
    "Northern Kurdish",
    "Hijazi",
    "Nigerian Fulfulde",
    "South Azerbaijani",
    "Greek",
    "Chittagonian",
    "Kazakh",
    "Deccan",
    "Hungarian",
    "Kinyarwanda",
    "Zulu",
    "South Levantine",
    "Tunisian",
    "Sanaani",
    "Min Bei Chinese",
    "Southern Pashto",
    "Rundi",
    "Czech",
    "Ta\u02bdizzi-Adeni",  # Taʽizzi-Adeni (same mojibake repair as Saʽidi)
    "Uyghur",
    "Min Dong Chinese",
    "Sylheti",
    "Baluchi",
]

# Shortened / alternative forms of names listed above, plus "Cantonese".
LANGUAGES.extend(["Levantine", "Pashto", "Fula", "Min Bei", "Cantonese"])
# Words that describe a race outside of an ethnicity
RACE_WORDS = {
    "african american",
    "african",
    "hispanic",
    "latin",
    "latino",
    "latina",
    "latinx",
    "asian",
    "native american",
    "indigenous",
}
# Derive "<word> american" and "<word>-american" variants for every entry that
# does not already contain "american".
# Each comprehension is deliberately a *list*: it is fully built before
# update() runs, so the set is never mutated while it is being iterated
# (a generator expression here would raise RuntimeError).
RACE_WORDS.update(
    [
        f"{race_word} american"
        for race_word in RACE_WORDS
        if "american" not in race_word
    ]
)
# The space-separated variants added above contain "american" and are skipped,
# so only the base words gain a hyphenated form.
RACE_WORDS.update(
    [
        f"{race_word}-american"
        for race_word in RACE_WORDS
        if "american" not in race_word
    ]
)
# Regex for compact person-descriptor codes made of three concatenated parts,
# each captured in its own group (e.g. "BMA" = Black / Male / Adult):
# - Race: A = Asian, B = Black, H = Hispanic, O = Oriental,
#   PI = Pacific Islander, W = White
# - Gender: F = Female, M = Male
# - Age: A = Adult, J = Juvenile
# The pattern carries no anchors or word boundaries of its own.
RACE_ABBREV = r"([ABHOW]|PI)(F|M)(A|J)"
# Ten hex colour codes; the trailing names are the matching CSS colour keywords.
# NOTE(review): how these are consumed is not visible in this module — confirm
# against the caller before reordering or extending.
WEB_COLORS = [
    "#FF0000",  # red
    "#0000FF",  # blue
    "#008000",  # green
    "#FFA500",  # orange
    "#800080",  # purple
    "#FF00FF",  # magenta / fuchsia
    "#C0C0C0",  # silver
    "#A52A2A",  # brown
    "#800000",  # maroon
    "#808000",  # olive
]
# Lower-case words used to describe skin colour / complexion.
# TODO: foresee issue of abbreviated skin colors e.g. BLK (black), BLN (blonde), BRN (brown)
SKIN_COLORS = {"black", "white", "brown", "latin", "dark", "pale", "light"}
# Lower-case nouns that refer to a person, in singular and plural form.
# "complexion" is not a person noun but is part of the original set.
PERSON_REF = {
    "boy",
    "boys",
    "man",
    "men",
    "male",
    "males",
    "gentleman",
    "gentlemen",
    "girl",
    "girls",
    "woman",
    "women",
    "female",
    "females",
    "lady",
    "ladies",
    "person",
    "persons",
    "people",
    "kid",
    "kids",
    "child",
    "children",
    "adult",
    "adults",
    "complexion",
    "guy",
    # Plural added for parity with every other noun in this set.
    "guys",
}
# Lower-case words and phrases used to describe hair colour, including
# non-colour states such as "bald", "bleached" and "dyed".
HAIR_COLORS = {
    "black",
    "white",
    "gray",
    "grey",
    "brown",
    "bald",
    "balding",
    "brunette",
    "blond",
    "blonde",
    "red",
    "orange",
    "auburn",
    "platinum",
    "ombre",
    "strawberry blond",
    "strawberry blonde",
    "salt and pepper",
    "salt & pepper",
    "bleached",
    "dyed",
    "light",
    "dark",
    "silver",
}
# Terms related to hair that might convey race and should be redacted
SENSITIVE_HAIR_REF = {
    "dreadlocks",
    "dread locks",
    "dread-locks",
    "cornrows",
    "corn rows",
    "corn-rows",
    "afro",
    "nappy",
}

# Terms used to indicate that we're discussing hair.
# Every SENSITIVE_HAIR_REF term is included via the set union below.
HAIR_REF = {
    "hair",
    "haircut",
    "hairstyle",
    "hair style",
    "hairdo",
    "moustache",
    # US spelling, added alongside the British "moustache".
    "mustache",
    "beard",
    "goatee",
    "sideburns",
    "mullet",
    "bun",
    "ponytail",
    "pony tail",
    "braids",
    "mohawk",
    "fade",
    "crew cut",
} | SENSITIVE_HAIR_REF
# Adjectives that are often used to describe hair
# (also contains the generic nouns "style" and "type").
HAIR_ADJS = {
    "short",
    "curly",
    "long",
    "wavy",
    "shaggy",
    "matted",
    "frizzy",
    "scraggly",
    "shoulder-length",
    "shoulder length",
    "straight",
    "wiry",
    "spiky",
    "bushy",
    "close cropped",
    "close-cropped",
    "cropped",
    "style",
    "type",
}
# Lower-case words and phrases used to describe eye colour.
EYE_COLORS = {
    "gray",
    "grey",
    "blue",
    "light blue",
    "green",
    "hazel",
    "black",
    "brown",
    "pink",
}
# Terms used to indicate that we're discussing eyes.
EYE_REF = {
    "eye",
    "eyes",
}
# Features correlated with race that are always redacted
RACE_FEATURES = {
    "latin",
    "brunette",
    "blond",
    "blonde",
}
# General-purpose colour words, including hyphenated and space-separated
# spellings of the red/orange compounds.
GENERAL_COLORS = {
    "black",
    "white",
    "gray",
    "grey",
    "red",
    "red-orange",
    "red orange",
    "orange-red",
    "orange red",
    "orange",
    "yellow",
    "green",
    "blue",
    "indigo",
    "turquoise",
    "violet",
    "purple",
    "pink",
    "magenta",
    "amber",
}
# Appearance attribute names. Despite the name, this is a set, not a list.
APPEARANCE_LIST = {"race", "complexion", "eyes", "hair"}
# Naming phrases plus relationship and gender words.
# NOTE(review): presumably cue words that appear next to a person's name in
# a narrative — confirm against the code that consumes this set.
NAME_PHRASES = {
    "name of",
    "named",
    "name",
    "brother",
    "sister",
    "father",
    "mother",
    "dad",
    "mom",
    "female",
    "male",
    "girlfriend",
    "boyfriend",
    "wife",
    "husband",
    "friend",
}
#####################################
# WARNING: List of racial slurs below
#####################################


# Slurs may appear in narratives, especially in quotes. They obviously convey
# racial information and so we should redact them.
# In no way does SCPL condone the use of any of these terms.
SLURS = {
    "beaner",
    "chink",
    "cracker",
    "coon",
    "hooknose",
    "kike",
    "kyke",
    "koon",
    "jap",
    "nigger",
    "redskin",
    "sandnigger",
    "spic",
    "wetback",
}