BPListDetector.java

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.detect.apple;

import java.io.IOException;
import java.text.ParseException;
import java.util.HashMap;
import java.util.Map;
import java.util.Set;
import javax.xml.parsers.ParserConfigurationException;

import com.dd.plist.NSDictionary;
import com.dd.plist.NSObject;
import com.dd.plist.PropertyListFormatException;
import com.dd.plist.PropertyListParser;
import org.apache.commons.io.IOUtils;
import org.xml.sax.SAXException;

import org.apache.tika.config.TikaComponent;
import org.apache.tika.detect.Detector;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;

/**
 * Detector for BPList with utility functions for PList.
 * <p>
 * Without significant refactoring, this can't easily work as a true
 * detector on plist subtypes.  Rather, for now, we require the file to be
 * parsed and then the parser adds the subtype for xml-based plists.
 * <p>
 * The media type constants are immutable: the binary-to-xml lookup used by
 * {@link #detectXMLOnKeys(Set)} is keyed on the binary constants, so they must
 * never be reassigned after class initialization.
 *
 * @since 1.25
 */
@TikaComponent
public class BPListDetector implements Detector {

    //xml versions
    public static final MediaType MEMGRAPH = MediaType.application("x-plist-memgraph");
    public static final MediaType WEBARCHIVE = MediaType.application("x-plist-webarchive");
    public static final MediaType PLIST = MediaType.application("x-plist");
    public static final MediaType ITUNES = MediaType.application("x-plist-itunes");


    //binary versions
    public static final MediaType BMEMGRAPH = MediaType.application("x-bplist-memgraph");
    public static final MediaType BWEBARCHIVE = MediaType.application("x-bplist-webarchive");
    public static final MediaType BPLIST = MediaType.application("x-bplist");
    public static final MediaType BITUNES = MediaType.application("x-bplist-itunes");

    /** Maps each binary plist media type to its xml counterpart. */
    private static final Map<MediaType, MediaType> BINARY_TO_XML = new HashMap<>();

    /** Leading bytes that every binary plist must start with. */
    private static final byte[] MAGIC = {'b', 'p', 'l', 'i', 's', 't'};

    static {
        BINARY_TO_XML.put(BMEMGRAPH, MEMGRAPH);
        BINARY_TO_XML.put(BWEBARCHIVE, WEBARCHIVE);
        BINARY_TO_XML.put(BPLIST, PLIST);
        BINARY_TO_XML.put(BITUNES, ITUNES);
    }

    /**
     * Picks the binary plist subtype based on the top-level dictionary keys.
     * The checks are applied in order: memgraph, webarchive, itunes; if none
     * matches, the generic {@link #BPLIST} type is returned.
     *
     * @param keySet top-level keys of the plist's root dictionary
     * @return one of {@link #BMEMGRAPH}, {@link #BWEBARCHIVE}, {@link #BITUNES}
     * or {@link #BPLIST}
     */
    public static MediaType detectOnKeys(Set<String> keySet) {
        if (keySet.contains("nodes") && keySet.contains("edges") &&
                keySet.contains("graphEncodingVersion")) {
            return BMEMGRAPH;
        }
        //should we also require keySet.contains("WebSubresources")?
        if (keySet.contains("WebMainResource")) {
            return BWEBARCHIVE;
        }
        if (keySet.contains("Playlists") && keySet.contains("Tracks") &&
                keySet.contains("Music Folder")) {
            return BITUNES;
        }
        //if it contains $archiver and $objects, it is a bplist inside a webarchive
        return BPLIST;
    }

    /**
     * Same decision as {@link #detectOnKeys(Set)}, but returns the xml-based
     * media type that corresponds to the detected binary type.
     *
     * @param keySet top-level keys of the plist's root dictionary
     * @return one of {@link #MEMGRAPH}, {@link #WEBARCHIVE}, {@link #ITUNES}
     * or {@link #PLIST}
     */
    public static MediaType detectXMLOnKeys(Set<String> keySet) {
        return BINARY_TO_XML.get(detectOnKeys(keySet));
    }

    /**
     * Checks whether the first bytes of the header equal the "bplist" magic.
     *
     * @param header buffer holding at least {@code MAGIC.length} bytes
     * @return true if the header starts with the magic bytes
     */
    private static boolean startsWithMagic(byte[] header) {
        for (int i = 0; i < MAGIC.length; i++) {
            if (header[i] != MAGIC[i]) {
                return false;
            }
        }
        return true;
    }

    /**
     * Detects binary plists by their "bplist" magic and, when the magic is
     * present, parses the whole plist to refine the subtype from the keys of
     * the root dictionary. The parsed root object is stored on the stream via
     * {@code setOpenContainer} so that it is available after detection.
     *
     * @param tis          input stream must support reset
     * @param metadata     input metadata for the document
     * @param parseContext parse context; not used by this detector
     * @return {@link MediaType#OCTET_STREAM} if the stream is null, if fewer
     * than six bytes could be read, if reading the header fails, or if the
     * header is not "bplist"; otherwise the subtype chosen by
     * {@link #detectOnKeys(Set)} when the root is a dictionary, or
     * {@link #BPLIST} for any other root object
     * @throws IOException if the stream cannot be reset after reading the
     *                     header, if I/O fails while parsing, or if the plist
     *                     parser rejects the content (the parser's exception
     *                     is attached as the cause)
     */
    @Override
    public MediaType detect(TikaInputStream tis, Metadata metadata, ParseContext parseContext) throws IOException {
        if (tis == null) {
            return MediaType.OCTET_STREAM;
        }
        //room for the 6 magic bytes plus the 2 version bytes
        byte[] header = new byte[8];
        tis.mark(header.length);

        try {
            int read = IOUtils.read(tis, header);
            if (read < MAGIC.length) {
                return MediaType.OCTET_STREAM;
            }
        } catch (IOException e) {
            //best effort: an unreadable header simply means "not detected"
            return MediaType.OCTET_STREAM;
        } finally {
            tis.reset();
        }

        if (!startsWithMagic(header)) {
            return MediaType.OCTET_STREAM;
        }
        //TODO: extract the version with the next two bytes if they were read
        NSObject rootObj;
        try {
            if (tis.hasFile()) {
                rootObj = PropertyListParser.parse(tis.getFile());
            } else {
                //NOTE(review): the stream itself is handed to the parser here and is
                //not reset afterwards -- confirm that callers do not rely on the
                //stream position being restored after detection
                rootObj = PropertyListParser.parse(tis);
            }
            tis.setOpenContainer(rootObj);
        } catch (PropertyListFormatException | ParseException |
                ParserConfigurationException | SAXException e) {
            throw new IOException("problem parsing root", e);
        }
        if (rootObj instanceof NSDictionary) {
            return detectOnKeys(((NSDictionary) rootObj).getHashMap().keySet());
        }
        return BPLIST;
    }
}