RegexCaptureParser.java

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.parser;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.Collections;
import java.util.HashMap;
import java.util.LinkedHashSet;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;

import org.apache.tika.config.ConfigDeserializer;
import org.apache.tika.config.JsonConfig;
import org.apache.tika.config.TikaComponent;
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;

/**
 * Parser for {@code text/plain} input that scans the text line by line with
 * configured regular expressions and records the results in the
 * {@link Metadata} object.
 * <p>
 * Two kinds of patterns are taken from the {@link RegexCaptureParserConfig}:
 * <ul>
 *   <li><b>capture patterns</b>: for every line on which the pattern is found,
 *       the text of capturing group 1 (or the whole match if the pattern
 *       declares no capturing group) is collected under the configured key.
 *       Values are de-duplicated per key, kept in first-seen order, and added
 *       to the metadata once the whole input has been read.</li>
 *   <li><b>match patterns</b>: as soon as the pattern is found on a line, the
 *       configured key is set to {@code "true"} in the metadata.</li>
 * </ul>
 * Only the first match on each line is considered for each pattern. Input is
 * always decoded as UTF-8.
 */
@TikaComponent(spi = false)
public class RegexCaptureParser implements Parser {

    private static final Set<MediaType> SUPPORTED_TYPES =
            Collections.singleton(MediaType.TEXT_PLAIN);

    private final RegexCaptureParserConfig config;
    /** Metadata key -> compiled pattern whose captured value is recorded. */
    private final Map<String, Pattern> captureMap;
    /** Metadata key -> compiled pattern whose mere presence is recorded as "true". */
    private final Map<String, Pattern> matchMap;
    /** Whether each line's characters are also forwarded to the content handler. */
    private final boolean writeContent;

    /**
     * Creates a parser from a default-constructed {@link RegexCaptureParserConfig}.
     */
    public RegexCaptureParser() {
        this(new RegexCaptureParserConfig());
    }

    /**
     * Creates a parser from the given configuration, compiling every configured
     * regular expression up front.
     *
     * @param config source of the capture map, the match map and the
     *               write-content flag
     * @throws java.util.regex.PatternSyntaxException if a configured expression
     *         is not a valid regular expression
     */
    public RegexCaptureParser(RegexCaptureParserConfig config) {
        this.config = config;
        this.captureMap = compilePatterns(config.getCaptureMap());
        this.matchMap = compilePatterns(config.getMatchMap());
        this.writeContent = config.isWriteContent();
    }

    /**
     * Creates a parser from a JSON configuration, which is turned into a
     * {@link RegexCaptureParserConfig} via {@link ConfigDeserializer#buildConfig}.
     *
     * @param jsonConfig JSON configuration to deserialize
     */
    public RegexCaptureParser(JsonConfig jsonConfig) {
        this(ConfigDeserializer.buildConfig(jsonConfig, RegexCaptureParserConfig.class));
    }

    /**
     * @return a singleton set containing {@link MediaType#TEXT_PLAIN}
     */
    @Override
    public Set<MediaType> getSupportedTypes(ParseContext context) {
        return SUPPORTED_TYPES;
    }

    /**
     * @return the configuration object this parser was constructed with
     */
    public RegexCaptureParserConfig getConfig() {
        return config;
    }

    /**
     * Reads {@code tis} as UTF-8 text, line by line, applying every capture and
     * match pattern to each line.
     * <p>
     * Match keys are written to {@code metadata} as soon as a line matches;
     * captured values are buffered and added only after the last line has been
     * read. When content writing is enabled, the characters of each line are
     * passed to {@code handler}.
     *
     * @param tis      stream to read; it is wrapped in a reader that is closed
     *                 when this method returns
     * @param handler  receives the line characters when content writing is enabled
     * @param metadata receives match flags and captured values
     * @param context  not used by this implementation
     * @throws IOException  if reading the stream fails
     * @throws SAXException if the handler rejects the characters
     */
    @Override
    public void parse(TikaInputStream tis, ContentHandler handler, Metadata metadata,
                      ParseContext context) throws IOException, SAXException, TikaException {
        // NOTE(review): closing the reader also closes the underlying stream
        // passed in by the caller -- confirm that this is the intended ownership.
        try (BufferedReader reader = new BufferedReader(new InputStreamReader(tis,
                StandardCharsets.UTF_8))) {
            String line = reader.readLine();
            // Matcher objects are created per call and re-targeted with reset(line),
            // so the shared compiled Patterns are never mutated.
            Map<String, Matcher> captureMatchers = newMatchers(captureMap);
            Map<String, Matcher> matchMatchers = newMatchers(matchMap);

            // key -> distinct captured values in first-seen order
            Map<String, Set<String>> keyVals = new HashMap<>();
            while (line != null) {
                for (Map.Entry<String, Matcher> e : captureMatchers.entrySet()) {
                    Matcher m = e.getValue().reset(line);
                    if (!m.find()) {
                        continue;
                    }
                    // A pattern without any capturing group has no group 1 and
                    // group(1) would throw IndexOutOfBoundsException; use the
                    // whole match in that case. The value may still be null when
                    // group 1 exists but did not participate in the match.
                    String val = m.groupCount() >= 1 ? m.group(1) : m.group();
                    keyVals.computeIfAbsent(e.getKey(), k -> new LinkedHashSet<>()).add(val);
                }
                for (Map.Entry<String, Matcher> e : matchMatchers.entrySet()) {
                    if (e.getValue().reset(line).find()) {
                        metadata.set(e.getKey(), "true");
                    }
                }
                if (writeContent) {
                    // NOTE(review): readLine() strips the line terminator, so the
                    // handler receives consecutive lines with no separator between
                    // them -- confirm whether a newline should be emitted here.
                    char[] chars = line.toCharArray();
                    handler.characters(chars, 0, chars.length);
                }
                line = reader.readLine();
            }
            for (Map.Entry<String, Set<String>> e : keyVals.entrySet()) {
                for (String val : e.getValue()) {
                    metadata.add(e.getKey(), val);
                }
            }
        }
    }

    /** Compiles each regular-expression string in {@code regexes}, keeping its key. */
    private static Map<String, Pattern> compilePatterns(Map<String, String> regexes) {
        Map<String, Pattern> compiled = new HashMap<>();
        for (Map.Entry<String, String> e : regexes.entrySet()) {
            compiled.put(e.getKey(), Pattern.compile(e.getValue()));
        }
        return compiled;
    }

    /** Creates one fresh, reusable {@link Matcher} per pattern, keeping its key. */
    private static Map<String, Matcher> newMatchers(Map<String, Pattern> patterns) {
        Map<String, Matcher> matchers = new HashMap<>();
        for (Map.Entry<String, Pattern> e : patterns.entrySet()) {
            matchers.put(e.getKey(), e.getValue().matcher(""));
        }
        return matchers;
    }
}