TikaCoreProperties.java
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.metadata;
/**
 * Contains a core set of basic Tika metadata properties, which all parsers
 * will attempt to supply (where the file format permits). These are all
 * defined in terms of other standard namespaces.
 * <p>
 * Users of Tika who wish to have consistent metadata across file formats
 * can make use of these Properties, knowing that where present they will
 * have consistent semantic meaning between different file formats. (No
 * matter if one file format calls it Title, another Long-Title and another
 * Long-Name, if they all mean the same thing as defined by
 * {@link DublinCore#TITLE} then they will all be present as such)
 * <p>
 * The standard-namespace properties declared here (e.g. {@link #TITLE},
 * {@link #CREATOR}) are plain aliases of the constants declared in the
 * corresponding namespace interfaces such as {@link DublinCore},
 * {@link Office} and {@link XMP}. The remaining properties are Tika-specific
 * and are mostly keyed under {@link #TIKA_META_PREFIX}.
 *
 * @since Apache Tika 1.2
 */
@SuppressWarnings("deprecation")
public interface TikaCoreProperties {

    /**
     * The common delimiter used between the namespace abbreviation and the property name
     */
    String NAMESPACE_PREFIX_DELIMITER = ":";

    /**
     * Use this to prefix metadata properties that store information
     * about the parsing process. Users should be able to distinguish
     * between metadata that was contained within the document and
     * metadata about the parsing process.
     */
    String TIKA_META_PREFIX = "X-TIKA" + NAMESPACE_PREFIX_DELIMITER;

    /**
     * Integer property keyed {@code X-TIKA:embedded_depth}. Presumably the nesting
     * depth of an embedded document within its container; confirm against the
     * code that sets it.
     */
    Property EMBEDDED_DEPTH = Property.internalInteger(TIKA_META_PREFIX + "embedded_depth");

    /**
     * This tracks the embedded file paths based on the name of embedded files
     * where available.
     * <p>
     * This field should be treated with great care and should NOT
     * be used for creating a directory structure to write out attachments
     * because: there may be path collisions or illegal characters or other mayhem.
     * <p>
     * For a more robust path, see {@link TikaCoreProperties#EMBEDDED_ID_PATH}.
     */
    Property EMBEDDED_RESOURCE_PATH =
            Property.internalText(TIKA_META_PREFIX + "embedded_resource_path");

    /**
     * This is calculated in {@link org.apache.tika.sax.RecursiveParserWrapperHandler}.
     * It differs from {@link TikaCoreProperties#EMBEDDED_RESOURCE_PATH} in that
     * it is calculated at the end of the full parse of a file.
     * {@link TikaCoreProperties#EMBEDDED_RESOURCE_PATH} is calculated during the parse,
     * and, for some parsers, an embedded file's name isn't known until
     * after its child files have been parsed.
     * <p>
     * Note that the unknown file count may differ between this field and
     * {@link TikaCoreProperties#EMBEDDED_RESOURCE_PATH} because there should be fewer
     * unknown files when this is calculated. More simply, there is no connection
     * between "embedded-1" in this field and "embedded-1" in
     * {@link TikaCoreProperties#EMBEDDED_RESOURCE_PATH}.
     * <p>
     * This field should be treated with great care and should NOT
     * be used for creating a directory structure to write out attachments
     * because: there may be path collisions or illegal characters or other mayhem.
     * <p>
     * For a more robust path, see {@link TikaCoreProperties#EMBEDDED_ID_PATH}.
     */
    Property FINAL_EMBEDDED_RESOURCE_PATH =
            Property.internalText(TIKA_META_PREFIX + "final_embedded_resource_path");

    /**
     * This tracks the embedded file paths based on the embedded file's
     * {@link TikaCoreProperties#EMBEDDED_ID}.
     */
    Property EMBEDDED_ID_PATH =
            Property.internalText(TIKA_META_PREFIX + "embedded_id_path");

    /**
     * This is a 1-indexed counter for embedded files, used by the RecursiveParserWrapper.
     */
    Property EMBEDDED_ID =
            Property.internalInteger(TIKA_META_PREFIX + "embedded_id");

    /**
     * Keyed {@code X-TIKA:parse_time_millis}. Note that this is declared as a
     * text property, not an integer one.
     */
    Property PARSE_TIME_MILLIS = Property.internalText(TIKA_META_PREFIX + "parse_time_millis");

    /**
     * Simple class name of the content handler.
     *
     * @deprecated Use {@link #TIKA_CONTENT_HANDLER_TYPE} for the handler type enum value.
     */
    @Deprecated
    Property TIKA_CONTENT_HANDLER = Property.internalText(TIKA_META_PREFIX + "content_handler");

    /**
     * The handler type used to produce {@link #TIKA_CONTENT}.
     * Value is the {@link org.apache.tika.sax.BasicContentHandlerFactory.HANDLER_TYPE}
     * enum name (e.g. {@code TEXT}, {@code MARKDOWN}, {@code HTML}, {@code XML}).
     */
    Property TIKA_CONTENT_HANDLER_TYPE =
            Property.internalText(TIKA_META_PREFIX + "content_handler_type");

    /**
     * The content produced by the content handler; see
     * {@link #TIKA_CONTENT_HANDLER_TYPE} for the handler type that produced it.
     */
    Property TIKA_CONTENT = Property.internalText(TIKA_META_PREFIX + "content");

    /**
     * JSON array of chunks (text segments with optional embedding vectors and locators).
     * Used by inference parsers and metadata filters to attach chunked representations
     * of document content for downstream indexing and semantic search.
     * <p>
     * Note: unlike most keys in this interface, this is a plain {@code String} key and
     * uses the lower-case {@code tika:} prefix rather than {@link #TIKA_META_PREFIX}.
     */
    String TIKA_CHUNKS = "tika:chunks";

    /**
     * Use this to store parse exception information in the Metadata object.
     */
    String TIKA_META_EXCEPTION_PREFIX = TIKA_META_PREFIX + "EXCEPTION" + NAMESPACE_PREFIX_DELIMITER;

    /**
     * Use this to store warnings that happened during the parse.
     */
    String TIKA_META_WARN_PREFIX = TIKA_META_PREFIX + "WARN" + NAMESPACE_PREFIX_DELIMITER;

    /**
     * Exception in the main (container) file.
     */
    Property CONTAINER_EXCEPTION =
            Property.internalText(TIKA_META_EXCEPTION_PREFIX + "container_exception");

    /**
     * Exception in an embedded file.
     */
    Property EMBEDDED_EXCEPTION =
            Property.internalTextBag(TIKA_META_EXCEPTION_PREFIX + "embedded_exception");

    /**
     * Exception handling the raw bytes of an embedded file by an EmbeddedDocumentByteStore.
     */
    Property EMBEDDED_BYTES_EXCEPTION =
            Property.internalTextBag(TIKA_META_EXCEPTION_PREFIX + "embedded_bytes_exception");

    /**
     * Warning while parsing in an embedded file.
     * <p>
     * Note: this is keyed under {@link #TIKA_META_EXCEPTION_PREFIX}, not
     * {@link #TIKA_META_WARN_PREFIX}. The key is left as-is because changing it
     * would change the metadata key that consumers read.
     */
    Property EMBEDDED_WARNING =
            Property.internalTextBag(TIKA_META_EXCEPTION_PREFIX + "embedded_warning");

    /**
     * Boolean flag keyed under {@link #TIKA_META_EXCEPTION_PREFIX}; by its name,
     * records that a write limit was reached during the parse.
     */
    Property WRITE_LIMIT_REACHED =
            Property.internalBoolean(TIKA_META_EXCEPTION_PREFIX + "write_limit_reached");

    /**
     * Boolean flag keyed under {@link #TIKA_META_EXCEPTION_PREFIX}; by its name,
     * records that a limit on the number of embedded resources was reached.
     */
    Property EMBEDDED_RESOURCE_LIMIT_REACHED =
            Property.internalBoolean(TIKA_META_EXCEPTION_PREFIX + "embedded_resource_limit_reached");

    /**
     * Boolean flag keyed under {@link #TIKA_META_EXCEPTION_PREFIX}; by its name,
     * records that a limit on embedded depth was reached.
     */
    Property EMBEDDED_DEPTH_LIMIT_REACHED =
            Property.internalBoolean(TIKA_META_EXCEPTION_PREFIX + "embedded_depth_limit_reached");

    /**
     * Use this to store exceptions caught during a parse that are
     * non-fatal, e.g. if a parser is in lenient mode and more
     * content can be extracted if we ignore an exception thrown by
     * a dependency.
     */
    Property TIKA_META_EXCEPTION_WARNING =
            Property.internalTextBag(TIKA_META_EXCEPTION_PREFIX + "warn");

    /**
     * This means that metadata keys or metadata values were truncated.
     * If there is an "include" filter, this should not be set if
     * a field is not in the "include" set.
     */
    Property TRUNCATED_METADATA =
            Property.internalBoolean(TIKA_META_WARN_PREFIX + "truncated_metadata");

    /**
     * This indicates that only a portion of the file content was provided for detection.
     * Detectors should check this flag and may adjust their behavior accordingly
     * (e.g., not returning a detection result that requires reading to end of file).
     */
    Property TRUNCATED_CONTENT_FOR_DETECTION =
            Property.internalBoolean(TIKA_META_PREFIX + "truncated_content_for_detection");

    /**
     * When content is truncated for detection, this stores the number of bytes
     * that were actually buffered for detection. This can be used by detectors
     * to set appropriate mark limits.
     */
    Property DETECTION_CONTENT_LENGTH =
            Property.internalInteger(TIKA_META_PREFIX + "detection_content_length");

    /**
     * Use this to store exceptions caught while trying to read the
     * stream of an embedded resource. Do not use this if there is
     * a parse exception on the embedded resource.
     */
    Property TIKA_META_EXCEPTION_EMBEDDED_STREAM =
            Property.internalTextBag(TIKA_META_EXCEPTION_PREFIX + "embedded_stream_exception");

    /**
     * Text bag keyed {@code X-TIKA:Parsed-By}. Presumably the parsers that handled
     * this particular document; compare {@link #TIKA_PARSED_BY_FULL_SET}.
     */
    Property TIKA_PARSED_BY = Property.internalTextBag(TIKA_META_PREFIX + "Parsed-By");

    /**
     * Use this to store a record of all parsers that touched a given file
     * in the container file's metadata.
     */
    Property TIKA_PARSED_BY_FULL_SET = Property.internalTextBag(TIKA_META_PREFIX + "Parsed-By-Full-Set");

    /**
     * External text bag keyed {@code X-TIKA:detected_language}.
     */
    Property TIKA_DETECTED_LANGUAGE = Property.externalTextBag(TIKA_META_PREFIX +
            "detected_language");

    /**
     * External text bag keyed {@code X-TIKA:detected_language_confidence}.
     * See {@link #TIKA_DETECTED_LANGUAGE_CONFIDENCE_RAW} for the numeric counterpart.
     */
    Property TIKA_DETECTED_LANGUAGE_CONFIDENCE = Property.externalTextBag(TIKA_META_PREFIX +
            "detected_language_confidence");

    /**
     * External sequence of real numbers keyed {@code X-TIKA:detected_language_confidence_raw}.
     */
    Property TIKA_DETECTED_LANGUAGE_CONFIDENCE_RAW = Property.externalRealSeq(TIKA_META_PREFIX +
            "detected_language_confidence_raw");

    /**
     * Text property keyed {@code X-TIKA:resourceName}, holding the name of the resource.
     */
    Property RESOURCE_NAME_KEY = Property.internalText(TIKA_META_PREFIX + "resourceName");

    /**
     * Indicates that the file extension on the resource name was inferred by Tika
     * (e.g., from content type detection) rather than provided by the original document.
     */
    Property RESOURCE_NAME_EXTENSION_INFERRED =
            Property.externalBoolean(TIKA_META_PREFIX + "resourceNameExtensionInferred");

    /**
     * Text property keyed {@code X-TIKA:embeddedRelationshipId}. Presumably an
     * identifier linking an embedded resource to its reference in the container;
     * confirm against the parsers that set it.
     */
    Property EMBEDDED_RELATIONSHIP_ID = Property.internalText(TIKA_META_PREFIX + "embeddedRelationshipId");

    /**
     * Metadata key used by {@link #EMBEDDED_RESOURCE_TYPE}. Note that this key is
     * not prefixed with {@link #TIKA_META_PREFIX}.
     */
    String EMBEDDED_RESOURCE_TYPE_KEY = "embeddedResourceType";

    /**
     * Some file formats can store information about their original
     * file name/location or about their attachment's original file name/location
     * within the file.
     */
    Property ORIGINAL_RESOURCE_NAME =
            Property.internalTextBag(TIKA_META_PREFIX + "origResourceName");

    /**
     * This should be used to store the path (relative or full)
     * of the source/container file, including the file name,
     * e.g. doc/path/to/my_pdf.pdf
     * <p>
     * This can also be used for a primary key within a database.
     */
    Property SOURCE_PATH = Property.internalText(TIKA_META_PREFIX + "sourcePath");

    /**
     * This records the metadata as stored within a file for an embedded file's path
     * including the file name. For example, a zip file may include an msg with this
     * path: /my-emails/important/this.msg
     */
    Property INTERNAL_PATH = Property.internalText(TIKA_META_PREFIX + "internalPath");

    /**
     * This is currently used to identify Content-Type that may be
     * included within a document, such as in html documents
     * (e.g. {@code <meta http-equiv="content-type" content="text/html; charset=UTF-8">}),
     * or the value might come from outside the document. This information
     * may be faulty and should be treated only as a hint.
     */
    Property CONTENT_TYPE_HINT = Property.internalText(HttpHeaders.CONTENT_TYPE + "-Hint");

    /**
     * This is used by users to override detection with the override detector.
     */
    Property CONTENT_TYPE_USER_OVERRIDE =
            Property.internalText(HttpHeaders.CONTENT_TYPE + "-Override");

    /**
     * This is used by parsers to override detection of embedded resources
     * with the override detector.
     */
    Property CONTENT_TYPE_PARSER_OVERRIDE =
            Property.internalText(HttpHeaders.CONTENT_TYPE + "-Parser-Override");

    /**
     * This is set by DefaultDetector to store the result of MimeTypes (magic byte)
     * detection. This allows downstream detectors to use it as a hint without
     * re-running magic detection.
     */
    Property CONTENT_TYPE_MAGIC_DETECTED =
            Property.internalText(HttpHeaders.CONTENT_TYPE + "-Magic-Detected");

    /**
     * @see DublinCore#FORMAT
     */
    Property FORMAT = DublinCore.FORMAT;

    /**
     * @see DublinCore#IDENTIFIER
     */
    Property IDENTIFIER = DublinCore.IDENTIFIER;

    /**
     * @see DublinCore#CONTRIBUTOR
     */
    Property CONTRIBUTOR = DublinCore.CONTRIBUTOR;

    /**
     * @see DublinCore#COVERAGE
     */
    Property COVERAGE = DublinCore.COVERAGE;

    /**
     * @see DublinCore#CREATOR
     */
    Property CREATOR = DublinCore.CREATOR;

    /**
     * @see Office#LAST_AUTHOR
     */
    Property MODIFIER = Office.LAST_AUTHOR;

    /**
     * @see XMP#CREATOR_TOOL
     */
    Property CREATOR_TOOL = XMP.CREATOR_TOOL;

    /**
     * @see DublinCore#LANGUAGE
     */
    Property LANGUAGE = DublinCore.LANGUAGE;

    /**
     * @see DublinCore#PUBLISHER
     */
    Property PUBLISHER = DublinCore.PUBLISHER;

    /**
     * @see DublinCore#RELATION
     */
    Property RELATION = DublinCore.RELATION;

    /**
     * @see DublinCore#RIGHTS
     */
    Property RIGHTS = DublinCore.RIGHTS;

    /**
     * @see DublinCore#SOURCE
     */
    Property SOURCE = DublinCore.SOURCE;

    /**
     * @see DublinCore#TYPE
     */
    Property TYPE = DublinCore.TYPE;

    /**
     * @see DublinCore#TITLE
     */
    Property TITLE = DublinCore.TITLE;

    // Descriptive properties

    /**
     * @see DublinCore#DESCRIPTION
     */
    Property DESCRIPTION = DublinCore.DESCRIPTION;

    /**
     * {@link DublinCore#SUBJECT}; should include both subject and keywords
     * if a document format has both. See also {@link Office#KEYWORDS}
     * and {@link OfficeOpenXMLCore#SUBJECT}.
     */
    Property SUBJECT = DublinCore.SUBJECT;

    // Date related properties

    /**
     * @see DublinCore#CREATED
     */
    Property CREATED = DublinCore.CREATED;

    /**
     * @see DublinCore#MODIFIED
     * @see Office#SAVE_DATE
     */
    Property MODIFIED = DublinCore.MODIFIED;

    /**
     * @see Office#PRINT_DATE
     */
    Property PRINT_DATE = Office.PRINT_DATE;

    /**
     * @see XMP#METADATA_DATE
     */
    Property METADATA_DATE = XMP.METADATA_DATE;

    // Geographic related properties

    /**
     * @see Geographic#LATITUDE
     */
    Property LATITUDE = Geographic.LATITUDE;

    /**
     * @see Geographic#LONGITUDE
     */
    Property LONGITUDE = Geographic.LONGITUDE;

    /**
     * @see Geographic#ALTITUDE
     */
    Property ALTITUDE = Geographic.ALTITUDE;

    // Comment and rating properties

    /**
     * @see XMP#RATING
     */
    Property RATING = XMP.RATING;

    /**
     * This is the number of images (as in a multi-frame gif) returned by
     * Java's {@link javax.imageio.ImageReader#getNumImages(boolean)}. See
     * the javadocs for known limitations.
     */
    Property NUM_IMAGES = Property.internalInteger("imagereader:NumImages");

    /**
     * @see OfficeOpenXMLExtended#COMMENTS
     */
    Property COMMENTS = OfficeOpenXMLExtended.COMMENTS;

    /**
     * Embedded resource type property, keyed {@link #EMBEDDED_RESOURCE_TYPE_KEY}.
     * <p>
     * The closed set of choices lists every constant of {@link EmbeddedResourceType}.
     * When a constant is added to that enum it must be added here as well, otherwise
     * the declared choices and the enum drift apart.
     */
    Property EMBEDDED_RESOURCE_TYPE = Property.internalClosedChoise(EMBEDDED_RESOURCE_TYPE_KEY,
            EmbeddedResourceType.ATTACHMENT.toString(), EmbeddedResourceType.INLINE.toString(),
            EmbeddedResourceType.METADATA.toString(), EmbeddedResourceType.MACRO.toString(),
            EmbeddedResourceType.THUMBNAIL.toString(), EmbeddedResourceType.RENDERING.toString(),
            EmbeddedResourceType.FONT.toString(), EmbeddedResourceType.VERSION.toString(),
            EmbeddedResourceType.ALTERNATE_FORMAT_CHUNK.toString());

    /**
     * Boolean property keyed {@code hasSignature}; by its name, whether the
     * document carries a signature. Not prefixed with {@link #TIKA_META_PREFIX}.
     */
    Property HAS_SIGNATURE = Property.internalBoolean("hasSignature");

    /**
     * Text bag keyed {@code signature:name}.
     */
    Property SIGNATURE_NAME = Property.internalTextBag("signature:name");

    /**
     * Date bag keyed {@code signature:date}.
     */
    Property SIGNATURE_DATE = Property.internalDateBag("signature:date");

    /**
     * Text bag keyed {@code signature:location}.
     */
    Property SIGNATURE_LOCATION = Property.internalTextBag("signature:location");

    /**
     * Text bag keyed {@code signature:reason}.
     */
    Property SIGNATURE_REASON = Property.internalTextBag("signature:reason");

    /**
     * Text bag keyed {@code signature:filter}.
     */
    Property SIGNATURE_FILTER = Property.internalTextBag("signature:filter");

    /**
     * Text bag keyed {@code signature:contact-info}.
     */
    Property SIGNATURE_CONTACT_INFO = Property.internalTextBag("signature:contact-info");

    /**
     * Whether the file is encrypted.
     */
    Property IS_ENCRYPTED = Property.internalBoolean(TIKA_META_PREFIX + "encrypted");

    /**
     * When an EncodingDetector detects an encoding, the encoding should be stored in this field.
     * This is different from {@link Metadata#CONTENT_ENCODING} because that is what a parser
     * chooses to use for processing a file. If an EncodingDetector returns "null", a parser
     * may choose to use a default encoding. We want to differentiate between a parser using a
     * default encoding and the output of an EncodingDetector.
     */
    Property DETECTED_ENCODING = Property.externalText(TIKA_META_PREFIX + "detectedEncoding");

    /**
     * This should be the simple class name of the EncodingDetector whose detected encoding
     * was used in the parse.
     */
    Property ENCODING_DETECTOR = Property.externalText(TIKA_META_PREFIX + "encodingDetector");

    /**
     * Diagnostic trace showing which encoding detectors ran and what each returned,
     * plus the arbitration method used when detectors disagreed.
     * Example: {@code "HtmlEncodingDetector->UTF-8, Icu4jEncodingDetector->windows-1256 (scored)"}
     */
    Property ENCODING_DETECTION_TRACE =
            Property.externalText(TIKA_META_PREFIX + "encodingDetectionTrace");

    /**
     * General metadata key for the count of non-final versions available within a file. This
     * was added initially to support generalizing incremental updates in PDF.
     */
    Property VERSION_COUNT = Property.externalInteger(TIKA_META_PREFIX + "versionCount");

    /**
     * General metadata key for the version number of a given file that contains
     * earlier versions within it. This number is 0-indexed for the earliest version.
     * The latest version does not have this metadata value. This was added initially
     * to support generalizing incremental updates in PDF.
     */
    Property VERSION_NUMBER = Property.externalInteger(TIKA_META_PREFIX + "versionNumber");

    /**
     * External text property keyed {@code X-TIKA:pipes_result}. Presumably the
     * outcome reported by the pipes framework; confirm against the code that sets it.
     */
    Property PIPES_RESULT = Property.externalText(TIKA_META_PREFIX + "pipes_result");

    /**
     * A file might contain different types of embedded documents.
     * The most common is the ATTACHMENT.
     * <p>
     * An INLINE embedded resource should be used for embedded image
     * files that are used to render the page image (as in PDXObjImages in PDF files).
     * <p>
     * A MACRO is code that is embedded in the document and is intended
     * to be executable within the application that opens the document. This
     * includes traditional macros within Microsoft Office files and
     * javascript within PDFActions. This would not include, e.g., an
     * .exe file embedded in a .zip file.
     * <p>
     * A VERSION is an earlier version of the file as in incremental updates.
     * The initial use case for this is incremental updates in PDFs, but
     * it could be applied to other file formats as well where earlier versions
     * are recoverable. See also {@link PDF#INCREMENTAL_UPDATE_NUMBER}
     * <p>
     * Not all parsers have yet implemented this.
     */
    enum EmbeddedResourceType {
        /** Image that is intended to be displayed in a rendering of the file. */
        INLINE,
        /** Standard attachment as in email. */
        ATTACHMENT,
        /** Any code that is intended to be run by the application. */
        MACRO,
        /** E.g. xmp, xfa. */
        METADATA,
        /** Embedded font files. */
        FONT,
        /** Thumbnail. TODO: set this in parsers that handle thumbnails. */
        THUMBNAIL,
        /** If a file has been rendered. */
        RENDERING,
        /** An earlier version of a file. */
        VERSION,
        /** OOXML inline alternate format chunk. */
        ALTERNATE_FORMAT_CHUNK
    }
}