// CleanPhoneText.java

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.sax;

import java.util.ArrayList;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Class to help de-obfuscate phone numbers in text.
 * <p>
 * {@link #clean(String)} lower-cases the input, rewrites spelled-out and
 * deliberately misspelled numerals (for example {@code "tw0"} or
 * {@code "twenty 1"}) into digits, and then drops every remaining non-digit
 * character. {@link #extractPhoneNumbers(String)} runs that clean-up and
 * collects every ten-digit run matching {@link #cleanPhoneRegex}.
 * <p>
 * All members are static; the class holds no per-instance state.
 */
public class CleanPhoneText {
    /**
     * Ordered substitution table used by {@link #clean(String)}.
     * <p>
     * The outer array is a list of groups, each group is a list of
     * {@code {regex, replacement}} pairs. Groups, and the pairs inside a group,
     * are applied strictly in array order via {@link String#replaceAll(String, String)},
     * so earlier entries shape what later entries see. Within a group, longer
     * words are listed ahead of words they contain (e.g. {@code "seventeen"}
     * before {@code "seven"}) so that the shorter pattern cannot break them up.
     */
    public static final String[][][] cleanSubstitutions =
            new String[][][]{{{"&#\\d{1,3};", ""}},         // first simply remove numeric entities
                    {{"th0usand", "thousand"},    // handle common misspellings
                            {"th1rteen", "thirteen"}, {"f0urteen", "fourteen"},
                            {"e1ghteen", "eighteen"}, {"n1neteen", "nineteen"},
                            {"f1fteen", "fifteen"}, {"s1xteen", "sixteen"}, {"th1rty", "thirty"},
                            {"e1ghty", "eighty"}, {"n1nety", "ninety"}, {"fourty", "forty"},
                            {"f0urty", "forty"}, {"e1ght", "eight"}, {"f0rty", "forty"},
                            {"f1fty", "fifty"}, {"s1xty", "sixty"}, {"zer0", "zero"},
                            {"f0ur", "four"}, {"f1ve", "five"}, {"n1ne", "nine"}, {"0ne", "one"},
                            {"tw0", "two"}, {"s1x", "six"}},
                    // mixed compound numeral words
                    // consider 7teen, etc.
                    {{"twenty[\\W_]{0,3}1", "twenty-one"}, {"twenty[\\W_]{0,3}2", "twenty-two"},
                            {"twenty[\\W_]{0,3}3", "twenty-three"},
                            {"twenty[\\W_]{0,3}4", "twenty-four"},
                            {"twenty[\\W_]{0,3}5", "twenty-five"},
                            {"twenty[\\W_]{0,3}6", "twenty-six"},
                            {"twenty[\\W_]{0,3}7", "twenty-seven"},
                            {"twenty[\\W_]{0,3}8", "twenty-eight"},
                            {"twenty[\\W_]{0,3}9", "twenty-nine"},
                            {"thirty[\\W_]{0,3}1", "thirty-one"},
                            {"thirty[\\W_]{0,3}2", "thirty-two"},
                            {"thirty[\\W_]{0,3}3", "thirty-three"},
                            {"thirty[\\W_]{0,3}4", "thirty-four"},
                            {"thirty[\\W_]{0,3}5", "thirty-five"},
                            {"thirty[\\W_]{0,3}6", "thirty-six"},
                            {"thirty[\\W_]{0,3}7", "thirty-seven"},
                            {"thirty[\\W_]{0,3}8", "thirty-eight"},
                            {"thirty[\\W_]{0,3}9", "thirty-nine"},
                            {"forty[\\W_]{0,3}1", "forty-one"}, {"forty[\\W_]{0,3}2", "forty-two"},
                            {"forty[\\W_]{0,3}3", "forty-three"},
                            {"forty[\\W_]{0,3}4", "forty-four"},
                            {"forty[\\W_]{0,3}5", "forty-five"}, {"forty[\\W_]{0,3}6", "forty-six"},
                            {"forty[\\W_]{0,3}7", "forty-seven"},
                            {"forty[\\W_]{0,3}8", "forty-eight"},
                            {"forty[\\W_]{0,3}9", "forty-nine"}, {"fifty[\\W_]{0,3}1", "fifty-one"},
                            {"fifty[\\W_]{0,3}2", "fifty-two"},
                            {"fifty[\\W_]{0,3}3", "fifty-three"},
                            {"fifty[\\W_]{0,3}4", "fifty-four"},
                            {"fifty[\\W_]{0,3}5", "fifty-five"}, {"fifty[\\W_]{0,3}6", "fifty-six"},
                            {"fifty[\\W_]{0,3}7", "fifty-seven"},
                            {"fifty[\\W_]{0,3}8", "fifty-eight"},
                            {"fifty[\\W_]{0,3}9", "fifty-nine"}, {"sixty[\\W_]{0,3}1", "sixty-one"},
                            {"sixty[\\W_]{0,3}2", "sixty-two"},
                            {"sixty[\\W_]{0,3}3", "sixty-three"},
                            {"sixty[\\W_]{0,3}4", "sixty-four"},
                            {"sixty[\\W_]{0,3}5", "sixty-five"}, {"sixty[\\W_]{0,3}6", "sixty-six"},
                            {"sixty[\\W_]{0,3}7", "sixty-seven"},
                            {"sixty[\\W_]{0,3}8", "sixty-eight"},
                            {"sixty[\\W_]{0,3}9", "sixty-nine"},
                            {"seventy[\\W_]{0,3}1", "seventy-one"},
                            {"seventy[\\W_]{0,3}2", "seventy-two"},
                            {"seventy[\\W_]{0,3}3", "seventy-three"},
                            {"seventy[\\W_]{0,3}4", "seventy-four"},
                            {"seventy[\\W_]{0,3}5", "seventy-five"},
                            {"seventy[\\W_]{0,3}6", "seventy-six"},
                            {"seventy[\\W_]{0,3}7", "seventy-seven"},
                            {"seventy[\\W_]{0,3}8", "seventy-eight"},
                            {"seventy[\\W_]{0,3}9", "seventy-nine"},
                            {"eighty[\\W_]{0,3}1", "eighty-one"},
                            {"eighty[\\W_]{0,3}2", "eighty-two"},
                            {"eighty[\\W_]{0,3}3", "eighty-three"},
                            {"eighty[\\W_]{0,3}4", "eighty-four"},
                            {"eighty[\\W_]{0,3}5", "eighty-five"},
                            {"eighty[\\W_]{0,3}6", "eighty-six"},
                            {"eighty[\\W_]{0,3}7", "eighty-seven"},
                            {"eighty[\\W_]{0,3}8", "eighty-eight"},
                            {"eighty[\\W_]{0,3}9", "eighty-nine"},
                            {"ninety[\\W_]{0,3}1", "ninety-one"},
                            {"ninety[\\W_]{0,3}2", "ninety-two"},
                            {"ninety[\\W_]{0,3}3", "ninety-three"},
                            {"ninety[\\W_]{0,3}4", "ninety-four"},
                            {"ninety[\\W_]{0,3}5", "ninety-five"},
                            {"ninety[\\W_]{0,3}6", "ninety-six"},
                            {"ninety[\\W_]{0,3}7", "ninety-seven"},
                            {"ninety[\\W_]{0,3}8", "ninety-eight"},
                            {"ninety[\\W_]{0,3}9", "ninety-nine"}},
                    // now resolve compound numeral words
                    {{"twenty-one", "21"}, {"twenty-two", "22"}, {"twenty-three", "23"},
                            {"twenty-four", "24"}, {"twenty-five", "25"}, {"twenty-six", "26"},
                            {"twenty-seven", "27"}, {"twenty-eight", "28"}, {"twenty-nine", "29"},
                            {"thirty-one", "31"}, {"thirty-two", "32"}, {"thirty-three", "33"},
                            {"thirty-four", "34"}, {"thirty-five", "35"}, {"thirty-six", "36"},
                            {"thirty-seven", "37"}, {"thirty-eight", "38"}, {"thirty-nine", "39"},
                            {"forty-one", "41"}, {"forty-two", "42"}, {"forty-three", "43"},
                            {"forty-four", "44"}, {"forty-five", "45"}, {"forty-six", "46"},
                            {"forty-seven", "47"}, {"forty-eight", "48"}, {"forty-nine", "49"},
                            {"fifty-one", "51"}, {"fifty-two", "52"}, {"fifty-three", "53"},
                            {"fifty-four", "54"}, {"fifty-five", "55"}, {"fifty-six", "56"},
                            {"fifty-seven", "57"}, {"fifty-eight", "58"}, {"fifty-nine", "59"},
                            {"sixty-one", "61"}, {"sixty-two", "62"}, {"sixty-three", "63"},
                            {"sixty-four", "64"}, {"sixty-five", "65"}, {"sixty-six", "66"},
                            {"sixty-seven", "67"}, {"sixty-eight", "68"}, {"sixty-nine", "69"},
                            {"seventy-one", "71"}, {"seventy-two", "72"}, {"seventy-three", "73"},
                            {"seventy-four", "74"}, {"seventy-five", "75"}, {"seventy-six", "76"},
                            {"seventy-seven", "77"}, {"seventy-eight", "78"},
                            {"seventy-nine", "79"}, {"eighty-one", "81"}, {"eighty-two", "82"},
                            {"eighty-three", "83"}, {"eighty-four", "84"}, {"eighty-five", "85"},
                            {"eighty-six", "86"}, {"eighty-seven", "87"}, {"eighty-eight", "88"},
                            {"eighty-nine", "89"}, {"ninety-one", "91"}, {"ninety-two", "92"},
                            {"ninety-three", "93"}, {"ninety-four", "94"}, {"ninety-five", "95"},
                            {"ninety-six", "96"}, {"ninety-seven", "97"}, {"ninety-eight", "98"},
                            {"ninety-nine", "99"}},
                    // larger units function as suffixes now
                    // assume never have three hundred four, three hundred and four
                    {{"hundred", "00"}, {"thousand", "000"}},
                    // single numeral words now
                    // some would have been ambiguous
                    {{"seventeen", "17"}, {"thirteen", "13"}, {"fourteen", "14"},
                            {"eighteen", "18"}, {"nineteen", "19"}, {"fifteen", "15"},
                            {"sixteen", "16"}, {"seventy", "70"}, {"eleven", "11"},
                            {"twelve", "12"}, {"twenty", "20"}, {"thirty", "30"}, {"eighty", "80"},
                            {"ninety", "90"}, {"three", "3"}, {"seven", "7"}, {"eight", "8"},
                            {"forty", "40"}, {"fifty", "50"}, {"sixty", "60"}, {"zero", "0"},
                            {"four", "4"}, {"five", "5"}, {"nine", "9"}, {"one", "1"}, {"two", "2"},
                            {"six", "6"}, {"ten", "10"}},
                    // now do letter for digit substitutions
                    {{"oh", "0"}, {"o", "0"}, {"i", "1"}, {"l", "1"}}};
    /**
     * Regex to identify a phone number in already-cleaned (digits only) text:
     * ten consecutive digits whose first and fourth digits are in 2-9.
     */
    static final String cleanPhoneRegex = "([2-9]\\d{2}[2-9]\\d{6})";
    /**
     * Regex which attempts to ignore punctuation and other distractions by
     * allowing a few non-word characters between the ten digits.
     * Not referenced by the methods of this class.
     */
    static final String phoneRegex =
            "([{(<]{0,3}[2-9][\\W_]{0,3}\\d[\\W_]{0,3}\\d[\\W_]{0,6}" +
                    "[2-9][\\W_]{0,3}\\d[\\W_]{0,3}\\d[\\W_]{0,6}\\d" +
                    "[\\W_]{0,3}\\d[\\W_]{0,3}\\d[\\W_]{0,3}\\d)";

    // Compiled once; Pattern instances are immutable and safe to share.
    private static final Pattern CLEAN_PHONE_PATTERN = Pattern.compile(cleanPhoneRegex);
    // "\D" already covers '+' and whitespace, so this class matches exactly
    // the non-digit characters; the literal is kept as originally written.
    private static final Pattern NON_DIGIT_PATTERN = Pattern.compile("[\\D+\\s]");

    /**
     * Extracts candidate phone numbers from free text.
     * <p>
     * The text is first normalised with {@link #clean(String)}; the resulting
     * digit string is then scanned left to right for non-overlapping matches of
     * {@link #cleanPhoneRegex}.
     *
     * @param text the raw text to search; must not be {@code null}
     * @return the ten-digit matches in order of appearance; empty if none were found
     * @throws NullPointerException if {@code text} is {@code null}
     */
    public static ArrayList<String> extractPhoneNumbers(String text) {
        String cleaned = clean(text);
        Matcher m = CLEAN_PHONE_PATTERN.matcher(cleaned);
        ArrayList<String> phoneNumbers = new ArrayList<>();
        int idx = 0;
        while (m.find(idx)) {
            String digits = m.group(1);
            int start = m.start(1);
            String prefix = start > 0 ? cleaned.substring(start - 1, start) : "";
            if (digits.startsWith("82") && prefix.equals("*")) {
                // this number overlaps with a *82 sequence: resume right after
                // the "82" of the current match. (Advancing relative to the old
                // search offset would re-find the same match or land inside it.)
                // NOTE(review): clean() removes every non-digit, including '*',
                // so with the current clean() this branch looks unreachable.
                idx = start + 2;
            } else {
                // seems good
                phoneNumbers.add(digits);
                idx = m.end(1);
            }
        }
        return phoneNumbers;
    }

    /**
     * Reduces text to a string of digits, converting spelled-out numerals on the way.
     * <p>
     * The text is lower-cased with {@link Locale#ROOT}, every entry of
     * {@link #cleanSubstitutions} is applied in order, and finally all
     * non-digit characters are deleted. Because the last substitution group
     * maps the letters {@code o}, {@code i} and {@code l} to digits wherever
     * they remain, ordinary words can contribute digits too (e.g.
     * {@code "call"} yields {@code "11"}).
     *
     * @param text the raw text; must not be {@code null}
     * @return a string consisting only of digits, possibly empty
     * @throws NullPointerException if {@code text} is {@code null}
     */
    public static String clean(String text) {
        text = text.toLowerCase(Locale.ROOT);
        // Read the public table on every call so that its current contents are honoured.
        for (String[][] group : cleanSubstitutions) {
            for (String[] sub : group) {
                text = text.replaceAll(sub[0], sub[1]);
            }
        }
        // Delete all non-digits and white space.
        return NON_DIGIT_PATTERN.matcher(text).replaceAll("");
    }
}