// MatroskaDetector.java
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.detect;
import java.io.IOException;
import org.apache.commons.io.IOUtils;
import org.apache.tika.config.TikaComponent;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
/**
 * Detector for Matroska (MKV and WEBM) files based on the EBML header.
 * <p>
 * The stream must begin with the four EBML magic bytes. The first
 * {@value #PROBE_LENGTH} bytes are then scanned for an ASCII doc-type marker
 * ({@code webm} or {@code matr}) to choose between the two media types.
 */
@TikaComponent
public class MatroskaDetector implements Detector {

    /** For serialization compatibility. */
    private static final long serialVersionUID = 1L;

    private static final MediaType MATROSKA =
            MediaType.application("x-matroska");

    private static final MediaType WEBM =
            MediaType.video("webm");

    /** Magic bytes that must open the stream for it to be considered at all. */
    private static final byte[] EBML_HEADER =
            new byte[]{0x1A, 0x45, (byte) 0xDF, (byte) 0xA3};

    /** ASCII marker whose presence in the probed bytes selects {@link #WEBM}. */
    private static final byte[] WEBM_MARKER =
            new byte[]{'w', 'e', 'b', 'm'};

    /** ASCII marker whose presence in the probed bytes selects {@link #MATROSKA}. */
    private static final byte[] MATROSKA_MARKER =
            new byte[]{'m', 'a', 't', 'r'};

    /** Maximum number of leading bytes inspected. */
    private static final int PROBE_LENGTH = 64;

    /**
     * Detects the media type of the input stream by inspecting EBML headers.
     * <p>
     * The stream is marked before reading and reset afterwards, even if the
     * read throws.
     *
     * @param tis the input stream; {@code null} yields {@code OCTET_STREAM}
     * @param metadata not consulted by this detector
     * @param parseContext not consulted by this detector
     * @return detected MediaType (WEBM, Matroska, or OCTET_STREAM)
     * @throws IOException if an I/O error occurs
     */
    @Override
    public MediaType detect(TikaInputStream tis, Metadata metadata, ParseContext parseContext) throws IOException {
        if (tis == null) {
            return MediaType.OCTET_STREAM;
        }

        byte[] header = new byte[PROBE_LENGTH];
        int bytesRead = -1;
        tis.mark(PROBE_LENGTH);
        try {
            bytesRead = IOUtils.read(tis, header, 0, PROBE_LENGTH);
        } finally {
            tis.reset();
        }

        // Too short to even hold the magic number.
        if (bytesRead < EBML_HEADER.length) {
            return MediaType.OCTET_STREAM;
        }
        if (!matchesAt(header, 0, EBML_HEADER)) {
            return MediaType.OCTET_STREAM;
        }

        // Scan every 4-byte window after the magic number. The bound is
        // inclusive so that a marker ending exactly on the last byte read
        // is still examined (the previous exclusive bound skipped it).
        int lastStart = bytesRead - WEBM_MARKER.length;
        for (int i = EBML_HEADER.length; i <= lastStart; i++) {
            if (matchesAt(header, i, WEBM_MARKER)) {
                return WEBM;
            }
            if (matchesAt(header, i, MATROSKA_MARKER)) {
                return MATROSKA;
            }
        }
        return MediaType.OCTET_STREAM;
    }

    /**
     * Tells whether {@code pattern} occurs in {@code data} starting at {@code offset}.
     * The caller must guarantee that {@code offset + pattern.length <= data.length}.
     */
    private static boolean matchesAt(byte[] data, int offset, byte[] pattern) {
        for (int k = 0; k < pattern.length; k++) {
            if (data[offset + k] != pattern[k]) {
                return false;
            }
        }
        return true;
    }
}