OggDetector.java

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.detect.ogg;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.gagravarr.ogg.OggFile;
import org.gagravarr.ogg.OggPacket;
import org.gagravarr.ogg.OggPacketReader;
import org.gagravarr.ogg.OggStreamIdentifier;
import org.gagravarr.ogg.OggStreamIdentifier.OggStreamType;
import org.gagravarr.skeleton.SkeletonPacket;
import org.gagravarr.skeleton.SkeletonPacketFactory;

import org.apache.tika.config.TikaComponent;
import org.apache.tika.detect.Detector;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;

/**
 * Detector for identifying specific file types stored
 * within an Ogg container.
 * <p>
 * Xiph provides a guide to Ogg mimetypes at
 * https://wiki.xiph.org/index.php/MIME_Types_and_File_Extensions
 * but we try to use more specific ones, as given by the Tika
 * mimetypes xml file.
 * <p>
 * Detection walks the packets of the Ogg file, counts the streams by
 * their identified type, and then picks the most specific media type
 * that describes the mix of streams found.
 */
@TikaComponent
public class OggDetector implements Detector {
    private static final long serialVersionUID = 591382028699008553L;

    /** Generic Ogg container, used when nothing more specific can be determined. */
    public static final MediaType OGG_GENERAL = MediaType.application("ogg");
    /** Generic Ogg audio, used when several different audio stream types are mixed. */
    public static final MediaType OGG_AUDIO = MediaType.audio("ogg");
    /** Generic Ogg video, used when several different video stream types are mixed. */
    public static final MediaType OGG_VIDEO = MediaType.video("ogg");

    /**
     * Detects the specific kind of Ogg file held in the given stream.
     * <p>
     * The stream is marked before reading and reset afterwards, both for
     * the initial magic check and for the full packet scan.
     *
     * @param tis          the document stream, may be {@code null}
     * @param metadata     document metadata (not consulted by this detector)
     * @param parseContext the parse context (not consulted by this detector)
     * @return {@link MediaType#OCTET_STREAM} if the stream is {@code null}, does
     *         not start with {@code OggS}, or the Ogg reader rejects it with an
     *         {@link UnsupportedOperationException}; otherwise the most specific
     *         Ogg media type that could be determined, falling back to
     *         {@link #OGG_GENERAL}
     * @throws IOException if reading from, or resetting, the stream fails
     */
    @Override
    public MediaType detect(TikaInputStream tis, Metadata metadata, ParseContext parseContext)
            throws IOException {
        // Check if we have access to the document
        if (tis == null) {
            return MediaType.OCTET_STREAM;
        }

        // Check if the document starts with the OGG capture pattern "OggS"
        tis.mark(4);
        try {
            if (tis.read() != 'O' || tis.read() != 'g'
                    || tis.read() != 'g' || tis.read() != 'S') {
                return MediaType.OCTET_STREAM;
            }
        } finally {
            tis.reset();
        }

        // We could potentially need to go a long way through the
        // file in order to figure out what it is, so ask for a mark
        // that covers the whole stream (clamped to what an int can hold)
        tis.mark(markLimitFor(tis.getLength()));

        try {
            // Open the Ogg file - underlying stream stays open as detecting only
            @SuppressWarnings("resource")
            OggFile ogg = new OggFile(tis);

            // The things we'll want to keep track of
            int totalStreams = 0;
            Map<OggStreamType, Integer> streams = new HashMap<>();
            // Skeleton packets are collected per stream id; they are not yet
            // consulted when choosing the media type, but are still built so
            // that packet handling stays the same as before
            Map<Integer, List<SkeletonPacket>> skeletonStreams = new HashMap<>();

            // Check the streams in turn
            OggPacketReader r = ogg.getPacketReader();
            try {
                OggPacket p;
                while ((p = r.getNextPacket()) != null) {
                    if (p.isBeginningOfStream()) {
                        totalStreams++;

                        OggStreamType type = OggStreamIdentifier.identifyType(p);

                        // If it's a Skeleton stream, start tracking
                        if (type == OggStreamIdentifier.SKELETON) {
                            List<SkeletonPacket> sp = new ArrayList<>();
                            sp.add(SkeletonPacketFactory.create(p));
                            skeletonStreams.put(p.getSid(), sp);
                        }

                        // Increment the per-type count
                        streams.merge(type, 1, Integer::sum);
                    } else {
                        // Is it a later packet of a skeleton stream we track?
                        List<SkeletonPacket> tracked = skeletonStreams.get(p.getSid());
                        if (tracked != null) {
                            tracked.add(SkeletonPacketFactory.create(p));
                        }
                    }
                }
            } catch (UnsupportedOperationException ignored) {
                // Silently swallow this problem with the file,
                // and just say "not ours"
                return MediaType.OCTET_STREAM;
            }

            return classify(totalStreams, streams);
        } finally {
            // Tidy up - reset the stream
            tis.reset();
        }
    }

    /**
     * Works out the read limit to pass to {@code mark} for a stream of the
     * given length.
     * <p>
     * A plain {@code (int) length + 1} wraps around for lengths of 2GiB or
     * more and yields 0 for a length of -1, either of which can leave the
     * mark unusable by the time we reset. Those cases are clamped to
     * {@link Integer#MAX_VALUE} instead.
     *
     * @param length the stream length in bytes; a negative value is treated
     *               as "unknown" (assumption - confirm against TikaInputStream)
     * @return {@code length + 1} when that fits in an int, otherwise
     *         {@link Integer#MAX_VALUE}
     */
    private static int markLimitFor(long length) {
        if (length < 0 || length >= Integer.MAX_VALUE) {
            return Integer.MAX_VALUE;
        }
        return (int) length + 1;
    }

    /**
     * Picks the media type that best describes the given mix of streams.
     *
     * @param totalStreams the number of streams seen in the file
     * @param streams      how many streams of each type were seen
     * @return the chosen media type, {@link #OGG_GENERAL} if nothing more
     *         specific applies
     */
    private static MediaType classify(int totalStreams, Map<OggStreamType, Integer> streams) {
        // First up, is it a simple single stream file?
        if (totalStreams == 1) {
            return toMediaType(streams.keySet().iterator().next());
        }

        // Is it one with a single non-metadata stream?
        int nonMetadataStreams = 0;
        for (Map.Entry<OggStreamType, Integer> entry : streams.entrySet()) {
            if (entry.getKey().kind != OggStreamType.Kind.METADATA) {
                nonMetadataStreams += entry.getValue();
            }
        }

        if (nonMetadataStreams == 0) {
            // Pure metadata, report as general ogg
            return OGG_GENERAL;
        }

        if (nonMetadataStreams == 1) {
            // Report as the non metadata kind
            for (OggStreamType type : streams.keySet()) {
                if (type.kind != OggStreamType.Kind.METADATA) {
                    return toMediaType(type);
                }
            }
        }

        // Tally up the video, audio and Kate streams
        int videoCount = 0;
        int audioCount = 0;
        int kateCount = 0;
        Set<OggStreamType> audioTypes = new HashSet<>();
        Set<OggStreamType> videoTypes = new HashSet<>();
        for (Map.Entry<OggStreamType, Integer> entry : streams.entrySet()) {
            OggStreamType type = entry.getKey();
            if (type.kind == OggStreamType.Kind.VIDEO) {
                videoCount += entry.getValue();
                videoTypes.add(type);
            } else if (type.kind == OggStreamType.Kind.AUDIO) {
                audioCount += entry.getValue();
                audioTypes.add(type);
            } else if (type == OggStreamIdentifier.KATE) {
                // Only ever tested for "any Kate at all" below
                kateCount++;
            }
        }

        // Is it a single video stream, with zero or more audio streams?
        if (videoCount == 1) {
            // Report it as the video type, not the audio within that
            return toMediaType(videoTypes.iterator().next());
        }

        // Is it multiple audio streams, with no video?
        if (videoCount == 0 && audioCount > 1) {
            // All the same audio kind? Report that, otherwise general audio
            return audioTypes.size() == 1
                    ? toMediaType(audioTypes.iterator().next())
                    : OGG_AUDIO;
        }

        // Is it multiple video streams?
        if (videoCount > 1) {
            // All the same video kind? Report that, otherwise general video
            return videoTypes.size() == 1
                    ? toMediaType(videoTypes.iterator().next())
                    : OGG_VIDEO;
        }

        // Is it only Kate streams, but no video nor audio?
        if (kateCount > 0) {
            return toMediaType(OggStreamIdentifier.KATE);
        }

        // Couldn't determine a more specific type
        return OGG_GENERAL;
    }

    /**
     * Converts from vorbis-java type to Tika's type.
     *
     * @param type the stream type identified by vorbis-java
     * @return {@link #OGG_GENERAL} for {@link OggStreamIdentifier#UNKNOWN},
     *         otherwise the result of parsing the type's mimetype string
     */
    protected static MediaType toMediaType(OggStreamType type) {
        if (type == OggStreamIdentifier.UNKNOWN) {
            // We don't have a specific type available to return
            return OGG_GENERAL;
        } else {
            // Say it's the specific type we found
            return MediaType.parse(type.mimetype);
        }
    }
}