Metadata.java

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.metadata;

import static org.apache.tika.utils.DateUtils.formatDate;

import java.io.Serializable;
import java.text.DateFormat;
import java.text.DateFormatSymbols;
import java.text.SimpleDateFormat;
import java.util.Arrays;
import java.util.Calendar;
import java.util.Date;
import java.util.Enumeration;
import java.util.HashMap;
import java.util.Locale;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Objects;
import java.util.Properties;
import java.util.TimeZone;

import org.apache.tika.metadata.Property.PropertyType;
import org.apache.tika.metadata.writefilter.MetadataWriteLimiter;
import org.apache.tika.metadata.writefilter.MetadataWriteLimiterFactory;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.utils.DateUtils;

/**
 * A multi-valued metadata container.
 */
public class Metadata
        implements CreativeCommons, Geographic, HttpHeaders, Message, ClimateForcast, TIFF,
        TikaMimeKeys, Serializable {


    /**
     * Default write limiter: stores every non-null value it is given and never truncates
     * or rejects anything.
     */
    private static final MetadataWriteLimiter ACCEPT_ALL = new MetadataWriteLimiter() {
        /**
         * Appends {@code value} to the values already stored under {@code field}; when
         * nothing is stored yet this behaves exactly like {@link #set}.
         */
        @Override
        public void add(String field, String value, Map<String, String[]> data) {
            final String[] existing = data.get(field);
            if (existing != null) {
                data.put(field, appendValues(existing, value));
                return;
            }
            set(field, value, data);
        }

        /**
         * Replaces whatever is stored under {@code field} with the single {@code value}.
         * Legacy behavior: a {@code null} value removes the field altogether.
         */
        @Override
        public void set(String field, String value, Map<String, String[]> data) {
            if (value == null) {
                data.remove(field);
                return;
            }
            data.put(field, new String[]{ value });
        }

        /**
         * Returns a copy of {@code values} with {@code value} appended, or {@code values}
         * itself (no copy) when {@code value} is {@code null}.
         */
        private String[] appendValues(String[] values, final String value) {
            if (value == null) {
                return values;
            }
            final String[] grown = Arrays.copyOf(values, values.length + 1);
            grown[values.length] = value;
            return grown;
        }
    };

    /**
     * Serial version UID
     */
    private static final long serialVersionUID = 5623926545693153182L;
    /**
     * Shared date helper used by {@link #parseDate(String)} to turn stored date strings
     * back into {@link Date} objects. Some parsers will have the date as an ISO-8601
     * string already, and will set that into the Metadata object.
     */
    private static final DateUtils DATE_UTILS = new DateUtils();
    /**
     * A map of all metadata attributes: metadata name to the array of values stored
     * under that name. Declared as {@code null} here; every constructor of this class
     * assigns a fresh {@link HashMap}.
     */
    private Map<String, String[]> metadata = null;


    /**
     * Strategy that {@link #add(String, String)} and {@link #set(String, String)} delegate
     * to when writing into {@link #metadata}. Defaults to {@link #ACCEPT_ALL}.
     */
    private MetadataWriteLimiter writeLimiter = ACCEPT_ALL;
    /**
     * Creates an empty metadata container that uses the default, accept-everything
     * write limiter.
     */
    public Metadata() {
        // A null limiter makes the one-argument constructor fall back to ACCEPT_ALL.
        this((MetadataWriteLimiter) null);
    }

    /**
     * Creates an empty metadata container whose single-value writes are routed through
     * the supplied write limiter.
     *
     * @param writeLimiter the limiter to apply to metadata writes; {@code null} selects
     *                     the default limiter, which imposes no limits
     * @since Apache Tika 4.0
     */
    public Metadata(MetadataWriteLimiter writeLimiter) {
        this.metadata = new HashMap<>();
        if (writeLimiter == null) {
            this.writeLimiter = ACCEPT_ALL;
        } else {
            this.writeLimiter = writeLimiter;
        }
    }

    /**
     * Factory for a Metadata instance configured from a {@link ParseContext}.
     * <p>
     * When the context carries a {@link MetadataWriteLimiterFactory}, the new instance is
     * given a limiter obtained from that factory; in every other case (including a
     * {@code null} context) a plain, unlimited Metadata is returned.
     * <p>
     * Parsers should prefer this over {@code new Metadata()} when creating metadata for
     * embedded documents, so that limits are in place from the moment of creation.
     *
     * @param context the ParseContext (may be null)
     * @return a new Metadata instance configured from the context
     * @since Apache Tika 4.0
     */
    public static Metadata newInstance(ParseContext context) {
        final MetadataWriteLimiterFactory limiterFactory =
                context == null ? null : context.get(MetadataWriteLimiterFactory.class);
        if (limiterFactory == null) {
            return new Metadata();
        }
        return new Metadata(limiterFactory.newInstance());
    }

    /**
     * Builds a {@link SimpleDateFormat} for the given pattern using US date symbols.
     *
     * @param format   the {@link SimpleDateFormat} pattern
     * @param timezone time zone to install on the format, or {@code null} to keep the
     *                 format's default time zone
     * @return the newly created date format
     */
    private static DateFormat createDateFormat(String format, TimeZone timezone) {
        final DateFormatSymbols usSymbols = new DateFormatSymbols(Locale.US);
        final SimpleDateFormat formatter = new SimpleDateFormat(format, usSymbols);
        if (timezone == null) {
            return formatter;
        }
        formatter.setTimeZone(timezone);
        return formatter;
    }

    /**
     * Parses the given date string by delegating to the shared {@link DateUtils} helper.
     * The method is {@code static synchronized}, so calls are serialized on the class
     * monitor to prevent concurrent access to the thread-unsafe date formats.
     *
     * @param date date string
     * @return parsed date, or <code>null</code> if the date can't be parsed
     * @see <a href="https://issues.apache.org/jira/browse/TIKA-495">TIKA-495</a>
     */
    private static synchronized Date parseDate(String date) {
        final Date parsed = DATE_UTILS.tryToParse(date);
        return parsed;
    }

    /**
     * Tells whether more than one value is stored for the given property.
     *
     * @param property metadata property
     * @return {@code true} if the property has two or more values, {@code false} if it
     * has a single value or is not set
     */
    public boolean isMultiValued(final Property property) {
        final String[] stored = metadata.get(property.getName());
        return stored != null && stored.length > 1;
    }

    /**
     * Tells whether more than one value is stored under the given metadata name.
     *
     * @param name name of metadata
     * @return {@code true} if the name has two or more values, {@code false} if it has a
     * single value or is not set
     */
    public boolean isMultiValued(final String name) {
        final String[] stored = metadata.get(name);
        if (stored == null) {
            return false;
        }
        return stored.length > 1;
    }

    /**
     * Returns the metadata names currently present, as a freshly allocated array.
     *
     * @return Metadata names
     */
    public String[] names() {
        final String[] target = new String[metadata.size()];
        return metadata.keySet().toArray(target);
    }

    /**
     * Get the value associated to a metadata name. If many values are associated to the
     * specified name, then the first one is returned.
     *
     * @param name of the metadata.
     * @return the first value associated to the specified metadata name, or
     * <code>null</code> if the name is not set or has no values
     */
    public String get(final String name) {
        String[] values = metadata.get(name);
        // The write limiter receives the raw map, so an empty array may have been stored;
        // treat that like "not set" instead of failing on values[0].
        if (values == null || values.length == 0) {
            return null;
        }
        return values[0];
    }

    /**
     * Looks up the value stored under the property's name. When several values are
     * stored, the first one is returned.
     *
     * @param property property definition
     * @return property value, or <code>null</code> if the property is not set
     * @since Apache Tika 0.7
     */
    public String get(Property property) {
        final String propertyName = property.getName();
        return get(propertyName);
    }

    /**
     * Reads an Integer based metadata property. When several values are stored, the
     * first one is used.
     *
     * @param property simple integer property definition
     * @return property value as an Integer, or <code>null</code> if the property's primary
     * property is not a simple integer property, the property is not set, or the stored
     * text is not a valid Integer
     * @since Apache Tika 0.8
     */
    public Integer getInt(Property property) {
        final Property primary = property.getPrimaryProperty();
        if (primary.getPropertyType() != Property.PropertyType.SIMPLE
                || primary.getValueType() != Property.ValueType.INTEGER) {
            return null;
        }

        final String raw = get(property);
        if (raw == null) {
            return null;
        }
        try {
            return Integer.valueOf(raw);
        } catch (NumberFormatException ignored) {
            // Documented contract: an unparseable value is reported as "no value".
            return null;
        }
    }

    /**
     * Reads a Date based metadata property. When several values are stored, the first
     * one is used.
     *
     * @param property simple date property definition
     * @return property value as a Date, or <code>null</code> if the property's primary
     * property is not a simple date property, the property is not set, or the stored
     * text is not a valid Date
     * @since Apache Tika 0.8
     */
    public Date getDate(Property property) {
        final Property primary = property.getPrimaryProperty();
        if (primary.getPropertyType() != Property.PropertyType.SIMPLE
                || primary.getValueType() != Property.ValueType.DATE) {
            return null;
        }

        final String raw = get(property);
        return raw == null ? null : parseDate(raw);
    }

    /**
     * Returns every value stored under the property's name.
     *
     * @param property of the metadata.
     * @return the values associated to the property's name; an empty array if there are
     * none
     */
    public String[] getValues(final Property property) {
        final String propertyName = property.getName();
        return _getValues(propertyName);
    }

    /**
     * Returns every value stored under a metadata name.
     *
     * @param name of the metadata.
     * @return the values associated to the metadata name; an empty array if there are none
     */
    public String[] getValues(final String name) {
        final String[] stored = _getValues(name);
        return stored;
    }

    /**
     * Shared lookup behind both {@code getValues} overloads: returns the stored array for
     * {@code name}, substituting a new empty array when nothing is stored.
     */
    private String[] _getValues(final String name) {
        final String[] stored = metadata.get(name);
        return stored != null ? stored : new String[0];
    }

    /**
     * Adds one value to the values associated with a metadata name. The write is handed
     * to this instance's write limiter, which decides what ends up in the map.
     *
     * @param name  the metadata name.
     * @param value the metadata value.
     */
    public void add(final String name, final String value) {
        this.writeLimiter.add(name, value, this.metadata);
    }

    /**
     * Add several metadata name/value mappings at once. Each of the given values is added
     * to the list of values associated to the specified metadata name.
     *
     * @param name      the metadata name.
     * @param newValues the metadata values; <code>null</code> adds nothing
     */
    protected void add(final String name, final String[] newValues) {
        // Previously a null array was harmless when the name was absent but threw a
        // NullPointerException when values already existed; make it a no-op in both cases.
        if (newValues == null) {
            return;
        }
        if (metadata.get(name) == null) {
            set(name, newValues);
            return;
        }
        for (String val : newValues) {
            add(name, val);
        }
    }

    /**
     * Adds a value to the values associated with a metadata property.
     * <p>
     * For a composite property the value is added to its primary property and to each of
     * its secondary extract properties (if any). For any other property the value is set
     * when nothing is stored yet, and appended otherwise, provided the property permits
     * multiple values.
     *
     * @param property the metadata property.
     * @param value    the metadata value.
     * @throws NullPointerException  if {@code property} is null
     * @throws PropertyTypeException if a value is already stored and the property does
     *                               not permit multiple values
     */
    public void add(final Property property, final String value) {
        Objects.requireNonNull(property, "property must not be null");

        if (property.getPropertyType() == PropertyType.COMPOSITE) {
            add(property.getPrimaryProperty(), value);
            if (property.getSecondaryExtractProperties() == null) {
                return;
            }
            for (Property secondary : property.getSecondaryExtractProperties()) {
                add(secondary, value);
            }
            return;
        }

        final String name = property.getName();
        if (metadata.get(name) == null) {
            set(property, value);
            return;
        }
        if (!property.isMultiValuePermitted()) {
            throw new PropertyTypeException(name + " : " + property.getPropertyType());
        }
        add(name, value);
    }

    /**
     * Copy all String key/value pairs from properties, replacing any values already
     * stored under the same names.
     * <p>
     * Each pair is written through this instance's write limiter, exactly like
     * {@link #set(String, String)}. Entries whose key or value is not a String are
     * skipped; such entries are not reachable through
     * {@link Properties#getProperty(String)} anyway.
     *
     * @param properties properties to copy from
     */
    public void setAll(Properties properties) {
        // stringPropertyNames() only yields keys whose key and value are both Strings,
        // so no unchecked cast is needed and getProperty() cannot return null here.
        for (String name : properties.stringPropertyNames()) {
            writeLimiter.set(name, properties.getProperty(name), metadata);
        }
    }

    /**
     * Associates a single value with a metadata name by handing the write to this
     * instance's write limiter. With the default limiter, any previous values for the
     * name are replaced, and a <code>null</code> value removes the metadata entry.
     *
     * @param name  the metadata name.
     * @param value the metadata value, or <code>null</code>
     */
    public void set(String name, String value) {
        this.writeLimiter.set(name, value, this.metadata);
    }

    /**
     * Replaces all values stored under {@code name} with the given values. The existing
     * entry is removed first; each new value is then added individually through
     * {@link #add(String, String)}. A {@code null} array just removes the entry.
     *
     * @param name   the metadata name
     * @param values the new values, or {@code null} to remove the entry
     */
    protected void set(String name, String[] values) {
        //TODO: optimize this to not copy if all
        //values are to be included "as is"
        metadata.remove(name);
        if (values == null) {
            return;
        }
        for (String value : values) {
            add(name, value);
        }
    }

    /**
     * Sets the value of a metadata property. For a composite property the value is set
     * on its primary property and on each of its secondary extract properties (if any);
     * otherwise it is set under the property's own name.
     *
     * @param property property definition
     * @param value    property value
     * @throws NullPointerException if {@code property} is null
     * @since Apache Tika 0.7
     */
    public void set(Property property, String value) {
        Objects.requireNonNull(property, "property must not be null");
        if (property.getPropertyType() != PropertyType.COMPOSITE) {
            set(property.getName(), value);
            return;
        }
        set(property.getPrimaryProperty(), value);
        if (property.getSecondaryExtractProperties() == null) {
            return;
        }
        for (Property secondary : property.getSecondaryExtractProperties()) {
            set(secondary, value);
        }
    }

    /**
     * Sets the values of a metadata property. For a composite property the values are
     * set on its primary property and on each of its secondary extract properties (if
     * any); otherwise they are set under the property's own name.
     *
     * @param property property definition
     * @param values   property values
     * @throws NullPointerException if {@code property} is null
     * @since Apache Tika 1.2
     */
    public void set(Property property, String[] values) {
        Objects.requireNonNull(property, "property must not be null");
        if (property.getPropertyType() != PropertyType.COMPOSITE) {
            set(property.getName(), values);
            return;
        }
        set(property.getPrimaryProperty(), values);
        if (property.getSecondaryExtractProperties() == null) {
            return;
        }
        for (Property secondary : property.getSecondaryExtractProperties()) {
            set(secondary, values);
        }
    }

    /**
     * Stores an int as the value of a metadata property, after verifying that the
     * property's primary property is a simple integer property.
     *
     * @param property simple integer property definition
     * @param value    property value
     * @throws PropertyTypeException if the primary property is not SIMPLE or its value
     *                               type is not INTEGER
     * @since Apache Tika 0.8
     */
    public void set(Property property, int value) {
        final Property primary = property.getPrimaryProperty();
        final Property.PropertyType propertyType = primary.getPropertyType();
        if (propertyType != Property.PropertyType.SIMPLE) {
            throw new PropertyTypeException(Property.PropertyType.SIMPLE, propertyType);
        }
        final Property.ValueType valueType = primary.getValueType();
        if (valueType != Property.ValueType.INTEGER) {
            throw new PropertyTypeException(Property.ValueType.INTEGER, valueType);
        }
        set(property, String.valueOf(value));
    }

    /**
     * Stores a long as the value of a metadata property. The property's primary property
     * must be a simple property whose value type is REAL (this is the value type the
     * method checks for long values).
     *
     * @param property simple property definition with value type REAL
     * @param value    property value
     * @throws PropertyTypeException if the primary property is not SIMPLE or its value
     *                               type is not REAL
     * @since Apache Tika 0.8
     */
    public void set(Property property, long value) {
        final Property primary = property.getPrimaryProperty();
        final Property.PropertyType propertyType = primary.getPropertyType();
        if (propertyType != Property.PropertyType.SIMPLE) {
            throw new PropertyTypeException(Property.PropertyType.SIMPLE, propertyType);
        }
        final Property.ValueType valueType = primary.getValueType();
        if (valueType != Property.ValueType.REAL) {
            throw new PropertyTypeException(Property.ValueType.REAL, valueType);
        }
        set(property, String.valueOf(value));
    }
    /**
     * Stores a boolean as the value of a metadata property, after verifying that the
     * property's primary property is a simple boolean property.
     *
     * @param property simple boolean property definition
     * @param value    property value
     * @throws PropertyTypeException if the primary property is not SIMPLE or its value
     *                               type is not BOOLEAN
     * @since Apache Tika 2.1.1
     */
    public void set(Property property, boolean value) {
        final Property primary = property.getPrimaryProperty();
        final Property.PropertyType propertyType = primary.getPropertyType();
        if (propertyType != Property.PropertyType.SIMPLE) {
            throw new PropertyTypeException(Property.PropertyType.SIMPLE, propertyType);
        }
        final Property.ValueType valueType = primary.getValueType();
        if (valueType != Property.ValueType.BOOLEAN) {
            throw new PropertyTypeException(Property.ValueType.BOOLEAN, valueType);
        }
        set(property, String.valueOf(value));
    }

    /**
     * Adds an int to the values of a metadata property, after verifying that the
     * property's primary property is a "seq" integer property.
     *
     * @param property seq integer property definition
     * @param value    property value
     * @throws PropertyTypeException if the primary property is not SEQ or its value type
     *                               is not INTEGER
     * @since Apache Tika 1.21
     */
    public void add(Property property, int value) {
        final Property primary = property.getPrimaryProperty();
        final Property.PropertyType propertyType = primary.getPropertyType();
        if (propertyType != PropertyType.SEQ) {
            throw new PropertyTypeException(PropertyType.SEQ, propertyType);
        }
        final Property.ValueType valueType = primary.getValueType();
        if (valueType != Property.ValueType.INTEGER) {
            throw new PropertyTypeException(Property.ValueType.INTEGER, valueType);
        }
        add(property, String.valueOf(value));
    }

    /**
     * Returns all values of a "seq" integer metadata property, parsed as ints.
     *
     * @param property seq integer property definition
     * @return array of ints, one per stored value (empty if nothing is stored)
     * @throws PropertyTypeException if the primary property is not SEQ or its value type
     *                               is not INTEGER
     * @throws NumberFormatException if a stored value is not a parsable int
     * @since Apache Tika 1.21
     */
    public int[] getIntValues(Property property) {
        final Property primary = property.getPrimaryProperty();
        final Property.PropertyType propertyType = primary.getPropertyType();
        if (propertyType != PropertyType.SEQ) {
            throw new PropertyTypeException(PropertyType.SEQ, propertyType);
        }
        final Property.ValueType valueType = primary.getValueType();
        if (valueType != Property.ValueType.INTEGER) {
            throw new PropertyTypeException(Property.ValueType.INTEGER, valueType);
        }
        return Arrays.stream(getValues(property)).mapToInt(Integer::parseInt).toArray();
    }

    /**
     * Returns all values of a "seq" metadata property, parsed as longs. The property's
     * primary property must have value type REAL (this is the value type the method
     * checks for long values).
     *
     * @param property seq property definition with value type REAL
     * @return array of longs, one per stored value (empty if nothing is stored)
     * @throws PropertyTypeException if the primary property is not SEQ or its value type
     *                               is not REAL
     * @throws NumberFormatException if a stored value is not a parsable long
     * @since Apache Tika 1.21
     */
    public long[] getLongValues(Property property) {
        final Property primary = property.getPrimaryProperty();
        final Property.PropertyType propertyType = primary.getPropertyType();
        if (propertyType != PropertyType.SEQ) {
            throw new PropertyTypeException(PropertyType.SEQ, propertyType);
        }
        final Property.ValueType valueType = primary.getValueType();
        if (valueType != Property.ValueType.REAL) {
            throw new PropertyTypeException(Property.ValueType.REAL, valueType);
        }
        return Arrays.stream(getValues(property)).mapToLong(Long::parseLong).toArray();
    }

    /**
     * Stores a double as the value of a metadata property whose primary property has
     * value type REAL or RATIONAL. Only the value type is checked here.
     *
     * @param property simple real or simple rational property definition
     * @param value    property value
     * @throws PropertyTypeException if the primary property's value type is neither REAL
     *                               nor RATIONAL
     * @since Apache Tika 0.8
     */
    public void set(Property property, double value) {
        final Property.ValueType valueType = property.getPrimaryProperty().getValueType();
        final boolean accepted = valueType == Property.ValueType.REAL
                || valueType == Property.ValueType.RATIONAL;
        if (!accepted) {
            throw new PropertyTypeException(Property.ValueType.REAL, valueType);
        }
        set(property, String.valueOf(value));
    }

    /**
     * Stores a {@link Date} as the value of a metadata property, formatted with
     * {@code DateUtils.formatDate}. A {@code null} date is passed on as a {@code null}
     * string value.
     *
     * @param property simple date property definition
     * @param date     property value
     * @throws PropertyTypeException if the primary property is not SIMPLE or its value
     *                               type is not DATE
     * @since Apache Tika 0.8
     */
    public void set(Property property, Date date) {
        final Property primary = property.getPrimaryProperty();
        final Property.PropertyType propertyType = primary.getPropertyType();
        if (propertyType != Property.PropertyType.SIMPLE) {
            throw new PropertyTypeException(Property.PropertyType.SIMPLE, propertyType);
        }
        final Property.ValueType valueType = primary.getValueType();
        if (valueType != Property.ValueType.DATE) {
            throw new PropertyTypeException(Property.ValueType.DATE, valueType);
        }
        final String dateString = date == null ? null : formatDate(date);
        set(property, dateString);
    }

    /**
     * Stores a {@link Calendar} as the value of a metadata property, formatted with
     * {@code DateUtils.formatDate}. A {@code null} calendar is passed on as a
     * {@code null} string value.
     *
     * @param property simple date property definition
     * @param date     property value
     * @throws PropertyTypeException if the primary property is not SIMPLE or its value
     *                               type is not DATE
     * @since Apache Tika 0.8
     */
    public void set(Property property, Calendar date) {
        final Property primary = property.getPrimaryProperty();
        final Property.PropertyType propertyType = primary.getPropertyType();
        if (propertyType != Property.PropertyType.SIMPLE) {
            throw new PropertyTypeException(Property.PropertyType.SIMPLE, propertyType);
        }
        final Property.ValueType valueType = primary.getValueType();
        if (valueType != Property.ValueType.DATE) {
            throw new PropertyTypeException(Property.ValueType.DATE, valueType);
        }
        final String dateString = date == null ? null : formatDate(date);
        set(property, dateString);
    }

    /**
     * Adds a {@link Calendar} to the values of a metadata property, formatted with
     * {@code DateUtils.formatDate}. Only the value type is checked here. A {@code null}
     * calendar is passed on as a {@code null} string value.
     *
     * @param property date property definition
     * @param date     property value
     * @throws PropertyTypeException if the primary property's value type is not DATE
     * @since Apache Tika 2.5.0
     */
    public void add(Property property, Calendar date) {
        final Property.ValueType valueType = property.getPrimaryProperty().getValueType();
        if (valueType != Property.ValueType.DATE) {
            throw new PropertyTypeException(Property.ValueType.DATE, valueType);
        }
        final String dateString = date == null ? null : formatDate(date);
        add(property, dateString);
    }

    /**
     * Deletes a metadata name together with every value stored under it. The removal is
     * applied to the map directly.
     *
     * @param name metadata name to remove
     */
    public void remove(String name) {
        this.metadata.remove(name);
    }

    /**
     * Counts the distinct metadata names held by this object (not the individual values).
     *
     * @return number of metadata names
     */
    public int size() {
        final int nameCount = this.metadata.size();
        return nameCount;
    }

    /**
     * Computes the hash code as the (wrapping) sum of the per-entry hash codes, so the
     * result does not depend on the map's iteration order.
     *
     * @return hash code of this metadata
     */
    @Override
    public int hashCode() {
        return metadata.entrySet().stream()
                .mapToInt(this::getMetadataEntryHashCode)
                .sum();
    }

    /**
     * Hashes one map entry: the key's hash XOR-ed with the content hash of its value
     * array.
     */
    private int getMetadataEntryHashCode(Entry<String, String[]> e) {
        final int nameHash = Objects.hashCode(e.getKey());
        final int valuesHash = Arrays.hashCode(e.getValue());
        return nameHash ^ valuesHash;
    }

    /**
     * Compares this metadata with another object. Two Metadata objects are equal when
     * they hold the same number of names and, for every name in this object, the other
     * object stores the same values in the same order. The write limiter is not part of
     * the comparison.
     *
     * @param o the object to compare with
     * @return <code>true</code> if {@code o} is a Metadata with equal content
     */
    @Override
    public boolean equals(Object o) {
        if (this == o) {
            return true;
        }
        if (!(o instanceof Metadata)) {
            return false;
        }

        Metadata other = (Metadata) o;
        if (other.size() != size()) {
            return false;
        }

        for (String name : names()) {
            // Arrays.equals compares length and elements and tolerates null elements,
            // matching the Arrays.hashCode used for hashing; the previous element-wise
            // equals() call threw a NullPointerException on a null value.
            if (!Arrays.equals(other._getValues(name), _getValues(name))) {
                return false;
            }
        }
        return true;
    }

    /**
     * Renders every name/value pair as {@code name=value}, pairs separated by a single
     * space. A name with several values appears once per value.
     *
     * @return textual form of this metadata
     */
    @Override
    public String toString() {
        final StringBuilder out = new StringBuilder();
        String separator = "";
        for (String name : names()) {
            for (String value : _getValues(name)) {
                // Every pair contributes at least "=", so the separator is needed
                // before every pair except the very first one.
                out.append(separator).append(name).append("=").append(value);
                separator = " ";
            }
        }
        return out.toString();
    }
}