// PDF.java
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.metadata;
/**
 * Collection of metadata property definitions and key prefixes that relate to PDF files.
 *
 * <p>Every member is a constant: either a {@code String} key prefix built with
 * {@link TikaCoreProperties#NAMESPACE_PREFIX_DELIMITER}, or a {@link Property} created
 * through one of the {@code Property} factory methods.
 *
 * @since Apache Tika 1.14
 */
public interface PDF {

    // ------------------------------------------------------------------
    // Namespace prefixes
    // ------------------------------------------------------------------

    /** Prefix for keys in the {@code pdf} namespace. */
    String PDF_PREFIX =
            "pdf" + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER;

    /** Prefix for keys in the {@code pdfa} namespace. */
    String PDFA_PREFIX =
            "pdfa" + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER;

    /** Prefix for keys in the {@code pdfaid} namespace. */
    String PDFAID_PREFIX =
            "pdfaid" + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER;

    /**
     * The %%EOF markers reported by the StartXRefScanner, stored as a sequence of real
     * values under the key {@code eofOffsets}. Consult that class for its limitations.
     *
     * <p>NOTE(review): earlier wording called this the "number of %%EOF"; the key name and
     * the sequence type suggest one offset per marker -- confirm against StartXRefScanner.
     *
     * <p>The final %%EOF is included, whether or not it sits at the literal end of the
     * file. An %%EOF whose startxref=0 is not included; that is what a dummy %%EOF in a
     * linearized PDF looks like.
     */
    Property EOF_OFFSETS =
            Property.externalRealSeq(PDF_PREFIX + "eofOffsets");

    // ------------------------------------------------------------------
    // Values taken from the docinfo section
    // ------------------------------------------------------------------

    /**
     * Prefix for properties recording what was stored in the docinfo section
     * (as opposed to XMP).
     */
    String PDF_DOC_INFO_PREFIX =
            PDF_PREFIX + "docinfo" + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER;

    /** Prefix nested under {@link #PDF_DOC_INFO_PREFIX} for {@code custom} docinfo keys. */
    String PDF_DOC_INFO_CUSTOM_PREFIX =
            PDF_DOC_INFO_PREFIX + "custom" + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER;

    /** Docinfo {@code created} entry, typed as a date. */
    Property DOC_INFO_CREATED =
            Property.internalDate(PDF_DOC_INFO_PREFIX + "created");

    /** Docinfo {@code creator} entry, typed as text. */
    Property DOC_INFO_CREATOR =
            Property.internalText(PDF_DOC_INFO_PREFIX + "creator");

    /** Docinfo {@code creator_tool} entry, typed as text. */
    Property DOC_INFO_CREATOR_TOOL =
            Property.internalText(PDF_DOC_INFO_PREFIX + "creator_tool");

    /** Docinfo {@code modified} entry, typed as a date. */
    Property DOC_INFO_MODIFICATION_DATE =
            Property.internalDate(PDF_DOC_INFO_PREFIX + "modified");

    /** Docinfo {@code keywords} entry, typed as text. */
    Property DOC_INFO_KEY_WORDS =
            Property.internalText(PDF_DOC_INFO_PREFIX + "keywords");

    /** Docinfo {@code producer} entry, typed as text. */
    Property DOC_INFO_PRODUCER =
            Property.internalText(PDF_DOC_INFO_PREFIX + "producer");

    /** Docinfo {@code subject} entry, typed as text. */
    Property DOC_INFO_SUBJECT =
            Property.internalText(PDF_DOC_INFO_PREFIX + "subject");

    /** Docinfo {@code title} entry, typed as text. */
    Property DOC_INFO_TITLE =
            Property.internalText(PDF_DOC_INFO_PREFIX + "title");

    /** Docinfo {@code trapped} entry, typed as text. */
    Property DOC_INFO_TRAPPED =
            Property.internalText(PDF_DOC_INFO_PREFIX + "trapped");

    // ------------------------------------------------------------------
    // Version and conformance identifiers
    // ------------------------------------------------------------------

    /** {@code PDFVersion} under the {@code pdf} prefix, typed as a rational. */
    Property PDF_VERSION =
            Property.internalRational(PDF_PREFIX + "PDFVersion");

    /** {@code PDFVersion} under the {@code pdfa} prefix, typed as a rational. */
    Property PDFA_VERSION =
            Property.internalRational(PDFA_PREFIX + "PDFVersion");

    /** {@code PDFExtensionVersion} under the {@code pdf} prefix, typed as a rational. */
    Property PDF_EXTENSION_VERSION =
            Property.internalRational(PDF_PREFIX + "PDFExtensionVersion");

    /** {@code conformance} under the {@code pdfaid} prefix, typed as text. */
    Property PDFAID_CONFORMANCE =
            Property.internalText(PDFAID_PREFIX + "conformance");

    /** {@code part} under the {@code pdfaid} prefix, typed as an integer. */
    Property PDFAID_PART =
            Property.internalInteger(PDFAID_PREFIX + "part");

    // The keys below are written out literally rather than assembled from a prefix constant.

    /** Integer property with the literal key {@code pdfuaid:part}. */
    Property PDFUAID_PART =
            Property.internalInteger("pdfuaid:part");

    /** Text property with the literal key {@code pdfvt:version}. */
    Property PDFVT_VERSION =
            Property.internalText("pdfvt:version");

    /** Date property with the literal key {@code pdfvt:modified}. */
    Property PDFVT_MODIFIED =
            Property.internalDate("pdfvt:modified");

    /** Text property with the literal key {@code pdfxid:version}. */
    Property PDFXID_VERSION =
            Property.internalText("pdfxid:version");

    /** Text property with the literal key {@code pdfx:version}. */
    Property PDFX_VERSION =
            Property.internalText("pdfx:version");

    /** Text property with the literal key {@code pdfx:conformance}. */
    Property PDFX_CONFORMANCE =
            Property.internalText("pdfx:conformance");

    /** Text property with the literal key {@code pdf:illustrator:type}. */
    Property ILLUSTRATOR_TYPE =
            Property.internalText("pdf:illustrator:type");

    /** Boolean property keyed {@code encrypted} under the {@code pdf} prefix. */
    Property IS_ENCRYPTED =
            Property.internalBoolean(PDF_PREFIX + "encrypted");

    /** Text property keyed {@code producer} under the {@code pdf} prefix. */
    Property PRODUCER =
            Property.internalText(PDF_PREFIX + "producer");

    // ------------------------------------------------------------------
    // Actions and triggers
    // ------------------------------------------------------------------

    /**
     * Where in the document an action or destination would be found or triggered:
     * on document open, before close, and so on.
     *
     * <p>This is set on the embedded document (js only for now?), not on the container PDF.
     */
    Property ACTION_TRIGGER =
            Property.internalText(PDF_PREFIX + "actionTrigger");

    /** All action or destination triggers contained within a given PDF. */
    Property ACTION_TRIGGERS =
            Property.internalTextBag(PDF_PREFIX + "actionTriggers");

    /** Text bag keyed {@code actionTypes} under the {@code pdf} prefix. */
    Property ACTION_TYPES =
            Property.internalTextBag(PDF_PREFIX + "actionTypes");

    // ------------------------------------------------------------------
    // Character and font statistics
    // ------------------------------------------------------------------

    /** Integer sequence keyed {@code charsPerPage} under the {@code pdf} prefix. */
    Property CHARACTERS_PER_PAGE =
            Property.internalIntegerSequence(PDF_PREFIX + "charsPerPage");

    /** Integer sequence keyed {@code unmappedUnicodeCharsPerPage} under the {@code pdf} prefix. */
    Property UNMAPPED_UNICODE_CHARS_PER_PAGE =
            Property.internalIntegerSequence(PDF_PREFIX + "unmappedUnicodeCharsPerPage");

    /** Integer property keyed {@code totalUnmappedUnicodeChars} under the {@code pdf} prefix. */
    Property TOTAL_UNMAPPED_UNICODE_CHARS =
            Property.internalInteger(PDF_PREFIX + "totalUnmappedUnicodeChars");

    /**
     * Real-valued property keyed {@code overallPercentageUnmappedUnicodeChars} under the
     * {@code pdf} prefix.
     */
    Property OVERALL_PERCENTAGE_UNMAPPED_UNICODE_CHARS =
            Property.internalReal(PDF_PREFIX + "overallPercentageUnmappedUnicodeChars");

    /** True when at least one character uses a damaged font. */
    Property CONTAINS_DAMAGED_FONT =
            Property.internalBoolean(PDF_PREFIX + "containsDamagedFont");

    /** True when at least one font is not embedded. */
    Property CONTAINS_NON_EMBEDDED_FONT =
            Property.internalBoolean(PDF_PREFIX + "containsNonEmbeddedFont");

    // ------------------------------------------------------------------
    // Structural features
    // ------------------------------------------------------------------

    /** Whether the PDF has XFA. */
    Property HAS_XFA =
            Property.internalBoolean(PDF_PREFIX + "hasXFA");

    /** Whether the PDF has XMP, regardless of whether that XMP is valid. */
    Property HAS_XMP =
            Property.internalBoolean(PDF_PREFIX + "hasXMP");

    /**
     * When XMP is extracted by, for example, the XMLProfiler, this records where it came
     * from: the document's document catalog, a specific page, or somewhere else.
     */
    Property XMP_LOCATION =
            Property.internalText(PDF_PREFIX + "xmpLocation");

    /** Whether the PDF has more than zero AcroForm fields. */
    Property HAS_ACROFORM_FIELDS =
            Property.internalBoolean(PDF_PREFIX + "hasAcroFormFields");

    /** Boolean property keyed {@code hasMarkedContent} under the {@code pdf} prefix. */
    Property HAS_MARKED_CONTENT =
            Property.internalBoolean(PDF_PREFIX + "hasMarkedContent");

    /**
     * Whether the root has a collection element. When true, the file is likely a
     * PDF Portfolio.
     */
    Property HAS_COLLECTION =
            Property.internalBoolean(PDF_PREFIX + "hasCollection");

    // ------------------------------------------------------------------
    // Embedded files and annotations
    // ------------------------------------------------------------------

    /** External text property keyed {@code embeddedFileDescription} under the {@code pdf} prefix. */
    Property EMBEDDED_FILE_DESCRIPTION =
            Property.externalText(PDF_PREFIX + "embeddedFileDescription");

    /** Set if the file came from an annotation and that annotation had a type. */
    Property EMBEDDED_FILE_ANNOTATION_TYPE =
            Property.internalText(PDF_PREFIX + "embeddedFileAnnotationType");

    /**
     * The literal string from PDEmbeddedFile#getSubtype(); this should be what the PDF
     * alleges the embedded file's mime type to be.
     */
    Property EMBEDDED_FILE_SUBTYPE =
            Property.internalText(PDF_PREFIX + "embeddedFileSubtype");

    /** Whether the PDF has an annotation of type 3D. */
    Property HAS_3D =
            Property.internalBoolean(PDF_PREFIX + "has3D");

    /** Text bag keyed {@code annotationTypes} under the {@code pdf} prefix. */
    Property ANNOTATION_TYPES =
            Property.internalTextBag(PDF_PREFIX + "annotationTypes");

    /** Text bag keyed {@code annotationSubtypes} under the {@code pdf} prefix. */
    Property ANNOTATION_SUBTYPES =
            Property.internalTextBag(PDF_PREFIX + "annotationSubtypes");

    /**
     * How many 3D annotations the PDF contains. With this available,
     * {@link PDF#HAS_3D} is redundant.
     */
    Property NUM_3D_ANNOTATIONS =
            Property.internalInteger(PDF_PREFIX + "num3DAnnotations");

    /** Text property keyed {@code associatedFileRelationship} under the {@code pdf} prefix. */
    Property ASSOCIATED_FILE_RELATIONSHIP =
            Property.internalText(PDF_PREFIX + "associatedFileRelationship");

    // ------------------------------------------------------------------
    // Incremental updates
    // ------------------------------------------------------------------

    /**
     * Zero-based number of an incremental update within a PDF: 0 is the first update,
     * 1 is the second, and so on. The final version of the PDF (that is, the last update)
     * carries no incremental update number.
     *
     * <p>The value is populated when the "parse incremental updates" feature is selected
     * in the PDFParser.
     *
     * <p>Declared as a composite: the primary property is the integer
     * {@code incrementalUpdateNumber}, and {@link TikaCoreProperties#VERSION_NUMBER} is the
     * single secondary property.
     */
    Property INCREMENTAL_UPDATE_NUMBER =
            Property.composite(
                    Property.internalInteger(PDF_PREFIX + "incrementalUpdateNumber"),
                    new Property[]{ TikaCoreProperties.VERSION_NUMBER });

    /**
     * Incremental updates as extracted by the StartXRefScanner. Consult that class for
     * its limitations.
     *
     * <p>Declared as a composite: the primary property is the external integer
     * {@code incrementalUpdateCount}, and {@link TikaCoreProperties#VERSION_COUNT} is the
     * single secondary property.
     */
    Property PDF_INCREMENTAL_UPDATE_COUNT =
            Property.composite(
                    Property.externalInteger(PDF_PREFIX + "incrementalUpdateCount"),
                    new Property[]{ TikaCoreProperties.VERSION_COUNT });

    // ------------------------------------------------------------------
    // OCR and javascript
    // ------------------------------------------------------------------

    /**
     * Count of pages that were OCR'd, or that would have been OCR'd, depending on the
     * OCR settings.
     *
     * <p>NOTE(review): the earlier comment stopped mid-sentence while describing the
     * NO_OCR setting; what is recorded in that case is not visible here -- confirm
     * against the PDFParser and complete this description.
     */
    Property OCR_PAGE_COUNT =
            Property.externalInteger(PDF_PREFIX + "ocrPageCount");

    /**
     * Name associated with a script when javascript is stored in the names tree. When
     * javascript is stored in an action there is no such name, and this metadata is
     * not populated.
     */
    Property JS_NAME =
            Property.internalText(PDF_PREFIX + "jsName");
}