MSOneStorePackage.java

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.parser.microsoft.onenote.fsshttpb;

import static org.apache.tika.parser.microsoft.onenote.OneNoteParser.ONE_NOTE_PREFIX;

import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.time.Instant;
import java.time.LocalDateTime;
import java.time.Month;
import java.time.ZoneOffset;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.xml.sax.SAXException;

import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Property;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.microsoft.onenote.OneNotePropertyEnum;
import org.apache.tika.parser.microsoft.onenote.OneNoteTreeWalkerOptions;
import org.apache.tika.parser.microsoft.onenote.fsshttpb.property.EightBytesOfData;
import org.apache.tika.parser.microsoft.onenote.fsshttpb.property.FourBytesOfData;
import org.apache.tika.parser.microsoft.onenote.fsshttpb.property.IProperty;
import org.apache.tika.parser.microsoft.onenote.fsshttpb.property.PrtFourBytesOfLengthFollowedByData;
import org.apache.tika.parser.microsoft.onenote.fsshttpb.streamobj.CellManifestDataElementData;
import org.apache.tika.parser.microsoft.onenote.fsshttpb.streamobj.PropertySet;
import org.apache.tika.parser.microsoft.onenote.fsshttpb.streamobj.RevisionManifestDataElementData;
import org.apache.tika.parser.microsoft.onenote.fsshttpb.streamobj.RevisionStoreObject;
import org.apache.tika.parser.microsoft.onenote.fsshttpb.streamobj.RevisionStoreObjectGroup;
import org.apache.tika.parser.microsoft.onenote.fsshttpb.streamobj.StorageIndexCellMapping;
import org.apache.tika.parser.microsoft.onenote.fsshttpb.streamobj.StorageIndexDataElementData;
import org.apache.tika.parser.microsoft.onenote.fsshttpb.streamobj.StorageIndexRevisionMapping;
import org.apache.tika.parser.microsoft.onenote.fsshttpb.streamobj.StorageManifestDataElementData;
import org.apache.tika.parser.microsoft.onenote.fsshttpb.streamobj.basic.CellID;
import org.apache.tika.parser.microsoft.onenote.fsshttpb.streamobj.basic.ExGuid;
import org.apache.tika.parser.microsoft.onenote.fsshttpb.streamobj.basic.HeaderCell;
import org.apache.tika.parser.microsoft.onenote.fsshttpb.streamobj.basic.PropertyID;
import org.apache.tika.parser.microsoft.onenote.fsshttpb.streamobj.basic.PropertyType;
import org.apache.tika.parser.microsoft.onenote.fsshttpb.unsigned.Unsigned;
import org.apache.tika.parser.microsoft.onenote.fsshttpb.util.BitConverter;
import org.apache.tika.sax.XHTMLContentHandler;

public class MSOneStorePackage {
    /**
     * See spec MS-ONE - 2.3.1 - TIME32 - epoch of jan 1 1980 UTC.
     * So we create this offset used to calculate number of seconds between this and the Instant
     * .EPOCH.
     */
    private static final long TIME32_EPOCH_DIFF_1980;
    /**
     * See spec MS-DTYP - 2.3.3 - DATETIME dates are based on epoch of jan 1 1601 UTC.
     * So we create this offset used to calculate number of seconds between this and the Instant
     * .EPOCH.
     */
    private static final long DATETIME_EPOCH_DIFF_1601;
    private static final Pattern HYPERLINK_PATTERN =
            Pattern.compile("\uFDDFHYPERLINK\\s+\"([^\"]+)\"([^\"]+)$");
    private static final String P = "p";

    static {
        LocalDateTime time32Epoch1980 = LocalDateTime.of(1980, Month.JANUARY, 1, 0, 0);
        Instant instant = time32Epoch1980.atZone(ZoneOffset.UTC).toInstant();
        TIME32_EPOCH_DIFF_1980 = (instant.toEpochMilli() - Instant.EPOCH.toEpochMilli()) / 1000;
    }

    static {
        LocalDateTime time32Epoch1601 = LocalDateTime.of(1601, Month.JANUARY, 1, 0, 0);
        Instant instant = time32Epoch1601.atZone(ZoneOffset.UTC).toInstant();
        DATETIME_EPOCH_DIFF_1601 = (instant.toEpochMilli() - Instant.EPOCH.toEpochMilli()) / 1000;
    }

    private final Set<String> authors = new HashSet<>();
    private final Set<String> mostRecentAuthors = new HashSet<>();
    private final Set<String> originalAuthors = new HashSet<>();
    public StorageIndexDataElementData storageIndex;
    public StorageManifestDataElementData storageManifest;
    public CellManifestDataElementData headerCellCellManifest;
    public RevisionManifestDataElementData headerCellRevisionManifest;
    public List<RevisionManifestDataElementData> revisionManifests;
    public List<CellManifestDataElementData> cellManifests;
    public HeaderCell headerCell;
    public List<RevisionStoreObjectGroup> dataRoot;
    public List<RevisionStoreObjectGroup> OtherFileNodeList;
    private boolean mostRecentAuthorProp = false;
    private boolean originalAuthorProp = false;
    private Instant lastModifiedTimestamp = Instant.MIN;
    private long creationTimestamp = Long.MAX_VALUE;
    private long lastModified = Long.MIN_VALUE;

    public MSOneStorePackage() {
        this.revisionManifests = new ArrayList<>();
        this.cellManifests = new ArrayList<>();
        this.OtherFileNodeList = new ArrayList<>();
    }

    /**
     * This method is used to find the Storage Index Cell Mapping matches the Cell ID.
     *
     * @param cellID Specify the Cell ID.
     * @return Return the specific Storage Index Cell Mapping.
     */
    public StorageIndexCellMapping findStorageIndexCellMapping(CellID cellID) {
        StorageIndexCellMapping storageIndexCellMapping = null;
        if (this.storageIndex != null) {
            storageIndexCellMapping = this.storageIndex.storageIndexCellMappingList.stream()
                    .filter(s -> s.cellID.equals(cellID)).findFirst()
                    .orElse(new StorageIndexCellMapping());
        }
        return storageIndexCellMapping;
    }

    /**
     * This method is used to find the Storage Index Revision Mapping that matches the Revision Mapping Extended GUID.
     *
     * @param revisionExtendedGUID Specify the Revision Mapping Extended GUID.
     * @return Return the instance of Storage Index Revision Mapping.
     */
    public StorageIndexRevisionMapping findStorageIndexRevisionMapping(
            ExGuid revisionExtendedGUID) {
        StorageIndexRevisionMapping instance = null;
        if (this.storageIndex != null) {
            instance = this.storageIndex.storageIndexRevisionMappingList.stream()
                    .filter(r -> r.revisionExGuid.equals(revisionExtendedGUID)).findFirst()
                    .orElse(new StorageIndexRevisionMapping());
        }

        return instance;
    }

    /**
     * Is this property a binary property?
     *
     * @param property The property.
     * @return Is it binary?
     */
    private boolean propertyIsBinary(OneNotePropertyEnum property) {
        return property == OneNotePropertyEnum.RgOutlineIndentDistance ||
                property == OneNotePropertyEnum.NotebookManagementEntityGuid ||
                property == OneNotePropertyEnum.RichEditTextUnicode;
    }

    public void walkTree(OneNoteTreeWalkerOptions options, Metadata metadata,
                         XHTMLContentHandler xhtml)
            throws SAXException, TikaException, IOException {
        for (RevisionStoreObjectGroup revisionStoreObjectGroup : OtherFileNodeList) {
            for (RevisionStoreObject revisionStoreObject : revisionStoreObjectGroup.objects) {
                PropertySet propertySet =
                        revisionStoreObject.propertySet.objectSpaceObjectPropSet.body;
                for (int i = 0; i < propertySet.rgData.size(); ++i) {
                    IProperty property = propertySet.rgData.get(i);
                    PropertyID propertyID = propertySet.rgPrids[i];
                    PropertyType propertyType = PropertyType.fromIntVal(propertyID.type);
                    OneNotePropertyEnum oneNotePropertyEnum =
                            OneNotePropertyEnum.of(Unsigned.uint(propertyID.value).longValue());
                    if (oneNotePropertyEnum == OneNotePropertyEnum.LastModifiedTimeStamp) {
                        long fullval = getScalar(property);
                        Instant instant = Instant.ofEpochSecond(
                                fullval / 10000000 + DATETIME_EPOCH_DIFF_1601);
                        if (instant.isAfter(lastModifiedTimestamp)) {
                            lastModifiedTimestamp = instant;
                        }
                        metadata.set(ONE_NOTE_PREFIX + "lastModifiedTimestamp",
                                String.valueOf(lastModifiedTimestamp.toEpochMilli()));
                    } else if (oneNotePropertyEnum == OneNotePropertyEnum.CreationTimeStamp) {
                        // add the TIME32_EPOCH_DIFF_1980 because OneNote TIME32 epoch time is per 1980, not
                        // 1970
                        long scalar = getScalar(property);
                        long creationTs = scalar + TIME32_EPOCH_DIFF_1980;
                        if (creationTs < creationTimestamp) {
                            creationTimestamp = creationTs;
                        }
                        metadata.set(ONE_NOTE_PREFIX + "creationTimestamp", String.valueOf(creationTimestamp));
                    } else if (oneNotePropertyEnum == OneNotePropertyEnum.LastModifiedTime) {
                        // add the TIME32_EPOCH_DIFF_1980 because OneNote TIME32 epoch time is per 1980, not
                        // 1970
                        long scalar = getScalar(property);
                        long lastMod = scalar + TIME32_EPOCH_DIFF_1980;
                        if (lastMod > lastModified) {
                            lastModified = lastMod;
                        }
                        metadata.set(TikaCoreProperties.MODIFIED, String.valueOf(lastModified));
                    } else if (oneNotePropertyEnum == OneNotePropertyEnum.Author) {
                        String author =
                                new String(((PrtFourBytesOfLengthFollowedByData) property).data,
                                        StandardCharsets.UTF_8);
                        if (mostRecentAuthorProp) {
                            mostRecentAuthors.add(author);
                        } else if (originalAuthorProp) {
                            originalAuthors.add(author);
                        } else {
                            authors.add(author);
                        }
                    } else if (oneNotePropertyEnum == OneNotePropertyEnum.AuthorMostRecent) {
                        mostRecentAuthorProp = true;
                    } else if (oneNotePropertyEnum == OneNotePropertyEnum.AuthorOriginal) {
                        originalAuthorProp = true;
                    } else if (propertyType == PropertyType.FourBytesOfLengthFollowedByData) {
                        boolean isBinary = propertyIsBinary(oneNotePropertyEnum);
                        PrtFourBytesOfLengthFollowedByData dataProperty =
                                (PrtFourBytesOfLengthFollowedByData) property;
                        if ((dataProperty.data.length & 1) == 0 &&
                                oneNotePropertyEnum != OneNotePropertyEnum.TextExtendedAscii &&
                                !isBinary) {
                            if (options.getUtf16PropertiesToPrint().contains(oneNotePropertyEnum)) {
                                xhtml.startElement(P);
                                xhtml.characters(
                                        new String(dataProperty.data, StandardCharsets.UTF_16LE));
                                xhtml.endElement(P);
                            }
                        } else if (oneNotePropertyEnum == OneNotePropertyEnum.TextExtendedAscii) {
                            xhtml.startElement(P);
                            xhtml.characters(
                                    new String(dataProperty.data, StandardCharsets.US_ASCII));
                            xhtml.endElement(P);
                        } else if (!isBinary) {
                            if (options.getUtf16PropertiesToPrint().contains(oneNotePropertyEnum)) {
                                xhtml.startElement(P);
                                xhtml.characters(
                                        new String(dataProperty.data, StandardCharsets.UTF_16LE));
                                xhtml.endElement(P);
                            }
                        } else {
                            if (oneNotePropertyEnum == OneNotePropertyEnum.RichEditTextUnicode) {
                                handleRichEditTextUnicode(dataProperty.data, xhtml);
                            } else {
                                //TODO -- these seem to be somewhat broken font files and other
                                //odds and ends...what are they and how should we process them?
                                //handleEmbedded(content.size());
                            }
                        }
                    }
                }
            }
        }
        if (!authors.isEmpty()) {
            metadata.set(TikaCoreProperties.CREATOR, authors.toArray(new String[]{}));
        }
        if (!mostRecentAuthors.isEmpty()) {
            metadata.set(Property.externalTextBag(ONE_NOTE_PREFIX + "mostRecentAuthors"),
                    mostRecentAuthors.toArray(new String[]{}));
        }
        if (!originalAuthors.isEmpty()) {
            metadata.set(Property.externalTextBag(ONE_NOTE_PREFIX + "originalAuthors"),
                    originalAuthors.toArray(new String[]{}));
        }
    }


    private void handleRichEditTextUnicode(byte[] arr, XHTMLContentHandler xhtml)
            throws SAXException, IOException, TikaException {
        // look for the first null
        int firstNull = 0;
        for (int i = 0; i < arr.length - 1; i += 2) {
            if (arr[i] == 0 && arr[i + 1] == 0) {
                firstNull = Math.max(i, 0);
                break;
            }
        }

        if (firstNull == 0) {
            return;
        }
        String txt = new String(arr, 0, firstNull, StandardCharsets.UTF_16LE);
        Matcher m = HYPERLINK_PATTERN.matcher(txt);
        if (m.find()) {
            xhtml.startElement("a", "href", m.group(1));
            xhtml.characters(m.group(2));
            xhtml.endElement("a");
        } else {
            xhtml.startElement(P);
            xhtml.characters(txt);
            xhtml.endElement(P);
        }
    }

    private long getScalar(IProperty property) throws TikaException, IOException {
        if (property instanceof FourBytesOfData) {
            FourBytesOfData fourBytesOfDataProp = (FourBytesOfData) property;
            return BitConverter.toUInt32(fourBytesOfDataProp.data, 0);
        } else if (property instanceof EightBytesOfData) {
            EightBytesOfData fourBytesOfDataProp = (EightBytesOfData) property;
            return BitConverter.toInt64(fourBytesOfDataProp.data, 0);
        }
        throw new TikaException("Could not parse scalar of type " + property.getClass());
    }
}