UniversalExecutableParser.java

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.parser.executable;

import java.io.IOException;
import java.io.InputStream;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.Set;

import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.tuple.Pair;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;

import org.apache.tika.config.TikaComponent;
import org.apache.tika.exception.TikaException;
import org.apache.tika.exception.UnsupportedFormatException;
import org.apache.tika.extractor.EmbeddedDocumentExtractor;
import org.apache.tika.extractor.EmbeddedDocumentUtil;
import org.apache.tika.io.EndianUtils;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.XHTMLContentHandler;

/**
 * Parser for universal executable files.
 */
@TikaComponent
public class UniversalExecutableParser implements Parser {
    private static final long serialVersionUID = 1L;

    private static final Set<MediaType> SUPPORTED_TYPES =
            Collections.singleton(MediaType.application("x-mach-o-universal"));

    private static final int MAX_ARCHS_COUNT = 1000;
    private static final int MAX_ARCH_SIZE = 500_000_000;//arbitrary

    @Override
    public Set<MediaType> getSupportedTypes(ParseContext arg0) {
        return SUPPORTED_TYPES;
    }

    @Override
    public void parse(TikaInputStream tis, ContentHandler handler, Metadata metadata,
                      ParseContext context) throws IOException, SAXException, TikaException {

        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata, context);
        xhtml.startDocument();

        EmbeddedDocumentExtractor extractor =
                EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context);

        byte[] first4 = new byte[4];
        IOUtils.readFully(tis, first4);

        if ((first4[0] == (byte) 0xBF || first4[0] == (byte) 0xBE) &&
                first4[1] == (byte) 0xBA && first4[2] == (byte) 0xFE && first4[3] == (byte) 0xCA) {
            parseMachO(xhtml, extractor, metadata, tis, first4, context);
        } else if (first4[0] == (byte) 0xCA && first4[1] == (byte) 0xFE &&
                first4[2] == (byte) 0xBA &&
                (first4[3] == (byte) 0xBF || first4[3] == (byte) 0xBE)) {
            parseMachO(xhtml, extractor, metadata, tis, first4, context);
        } else {
            throw new UnsupportedFormatException("Not a universal executable file");
        }

        xhtml.endDocument();
    }

    /**
     * Parses a Mach-O Universal file
     */
    public void parseMachO(XHTMLContentHandler xhtml, EmbeddedDocumentExtractor extractor,
                           Metadata metadata, InputStream tis,
                           byte[] first4, ParseContext context)
            throws IOException, SAXException, TikaException {
        var currentOffset = (long) first4.length;
        var isLE = first4[3] == (byte) 0xCA;
        var is64 = first4[isLE ? 0 : 3] == (byte) 0xBF;
        int archStructSize = 4 /* cputype */ + 4 /* cpusubtype */ + (is64
                ? 8 /* offset */ + 8 /* size */ + 4 /* align */ + 4 /* reserved */
                : 4 /* offset */ + 4 /* size */ + 4 /* align */);

        int archsCount = isLE ? EndianUtils.readIntLE(tis) : EndianUtils.readIntBE(tis);
        if (archsCount < 1) {
            throw new TikaException("Invalid number of architectures: " + archsCount);
        }
        if (archsCount > MAX_ARCHS_COUNT) {
            throw new TikaException("Number of architectures=" + archsCount + " greater than max allowed=" + MAX_ARCHS_COUNT);
        }

        currentOffset += 4;

        long archsSize = (long) archsCount * archStructSize;

        var unsortedOffsets = false;
        var offsetAndSizePerArch = new Pair[archsCount];
        for (int archIndex = 0; archIndex < archsCount; archIndex++) {
            IOUtils.skipFully(tis, 8);

            long offset = is64
                    ? (isLE ? EndianUtils.readLongLE(tis) : EndianUtils.readLongBE(tis))
                    : (isLE ? EndianUtils.readIntLE(tis) : EndianUtils.readIntBE(tis));
            if (offset < 4 + 4 + archsSize) {
                throw new TikaException("Invalid offset: " + offset);
            }
            if (!unsortedOffsets && archIndex > 0 && offset < (long) offsetAndSizePerArch[archIndex - 1].getLeft()) {
                unsortedOffsets = true;
            }
            long size = is64
                    ? (isLE ? EndianUtils.readLongLE(tis) : EndianUtils.readLongBE(tis))
                    : (isLE ? EndianUtils.readIntLE(tis) : EndianUtils.readIntBE(tis));

            if (size < 0 || size > MAX_ARCH_SIZE) {
                throw new TikaException("Arch size=" + size + " must be > 0 and < " + MAX_ARCH_SIZE);
            }
            offsetAndSizePerArch[archIndex] = Pair.of(offset, size);

            if (is64) {
                IOUtils.skipFully(tis, 8);
            } else {
                IOUtils.skipFully(tis, 4);
            }

            currentOffset += archStructSize;
        }
        if (unsortedOffsets) {
            Arrays.sort(offsetAndSizePerArch, Comparator.comparingLong(entry -> (long) entry.getLeft()));
        }

        for (int archIndex = 0; archIndex < archsCount; archIndex++) {
            long skipUntilStart = (long)offsetAndSizePerArch[archIndex].getLeft() - currentOffset;
            IOUtils.skipFully(tis, skipUntilStart);
            currentOffset += skipUntilStart;
            long sz = (long)offsetAndSizePerArch[archIndex].getRight();
            //we bounds checked this above.
            byte[] perArchMachO = new byte[(int)sz];
            IOUtils.readFully(tis, perArchMachO);
            currentOffset += perArchMachO.length;

            var perArchMetadata = Metadata.newInstance(context);
            var tikaInputStream = TikaInputStream.get(perArchMachO, perArchMetadata);
            if (extractor.shouldParseEmbedded(perArchMetadata)) {
                extractor.parseEmbedded(tikaInputStream, xhtml, perArchMetadata, context, true);
            }
        }
    }

}