CMap.java

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.fontbox.cmap;

import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.logging.log4j.Logger;
import org.apache.logging.log4j.LogManager;
/**
 * This class represents a CMap file.
 * <p>
 * An instance holds the codespace ranges used by {@link #readCode(InputStream)} to split a byte stream into
 * character codes, the mappings from character codes to Unicode strings and the mappings from character codes
 * to CIDs (single mappings as well as ranges).
 *
 * @author Ben Litchfield
 */
public class CMap
{
    private static final Logger LOG = LogManager.getLogger(CMap.class);

    private int wmode = 0;
    private String cmapName = null;
    private String cmapVersion = null;
    private int cmapType = -1;

    private String registry = null;
    private String ordering = null;
    private int supplement = 0;

    // shortest/longest byte length of all known codespace ranges; the minimum starts at 4 so that
    // the first added range lowers it
    private int minCodeLength = 4;
    private int maxCodeLength;

    // shortest/longest byte length of all codes having a CID mapping; the minimum starts at 4 so that
    // the first added mapping lowers it
    private int minCidLength = 4;
    private int maxCidLength = 0;

    // code lengths
    private final List<CodespaceRange> codespaceRanges = new ArrayList<>();

    // Unicode mappings
    // one byte input values
    private final Map<Integer, String> charToUnicodeOneByte = new HashMap<>();
    // two byte input values
    private final Map<Integer, String> charToUnicodeTwoBytes = new HashMap<>();
    // 3 / 4 byte input values
    private final Map<Integer, String> charToUnicodeMoreBytes = new HashMap<>();

    // CID mappings
    // map with all code to cid mappings organized by the origin byte length of the input value
    private final Map<Integer, Map<Integer, Integer>> codeToCid = new HashMap<>();
    private final List<CIDRange> codeToCidRanges = new ArrayList<>();

    // inverted map
    private final Map <String, byte[]> unicodeToByteCodes = new HashMap<>();

    private static final String SPACE = " ";
    // code mapped to the space character, -1 as long as no such mapping has been added
    private int spaceMapping = -1;

    /**
     * Creates a new instance of CMap.
     */
    CMap()
    {
    }

    /**
     * This will tell if this cmap has any CID mappings.
     * 
     * @return true If there are any CID mappings (single mappings or ranges), false otherwise.
     */
    public boolean hasCIDMappings()
    {
        return !codeToCid.isEmpty() || !codeToCidRanges.isEmpty();
    }

    /**
     * This will tell if this cmap has any Unicode mappings.
     *
     * @return true If there are any Unicode mappings, false otherwise.
     */
    public boolean hasUnicodeMappings()
    {
        return !charToUnicodeOneByte.isEmpty() || !charToUnicodeTwoBytes.isEmpty() || !charToUnicodeMoreBytes.isEmpty();
    }

    /**
     * Returns the sequence of Unicode characters for the given character code.
     *
     * This method exists for convenience. It may return false values as the origin byte length of the input value is
     * unknown and the mapping for some input values aren't unique. <br>
     * Example:<br>
     * The two byte value 0x00, 0x65 maps to 0x20 <br>
     * An input value of 0x65 always returns 0x20 even if the value has an origin byte length of 1.
     *
     * @param code character code
     * @return Unicode characters (may be more than one, e.g "fi" ligature), or null if there is no mapping
     */
    public String toUnicode(int code)
    {
        // The byte length is derived from the highest non-zero byte. Unsigned shifts are used so that a
        // four byte code with the top bit set (a negative int) isn't mistaken for a shorter code.
        String unicode = (code >>> 8) == 0 ? toUnicode(code, 1) : null;
        if (unicode == null)
        {
            if ((code >>> 16) == 0)
            {
                return toUnicode(code, 2);
            }
            if ((code >>> 24) == 0)
            {
                return toUnicode(code, 3);
            }
            return toUnicode(code, 4);
        }
        return unicode;
    }

    /**
     * Returns the sequence of Unicode characters for the given character code.
     *
     * @param code character code
     * @param length code length; 1 and 2 select the one and two byte mappings, every other value selects the
     * mappings for 3 and 4 byte codes
     * @return Unicode characters (may be more than one, e.g "fi" ligature), or null if there is no mapping
     */
    public String toUnicode(int code, int length)
    {
        if (length == 1)
        {
            return charToUnicodeOneByte.get(code);
        }
        if (length == 2)
        {
            return charToUnicodeTwoBytes.get(code);
        }
        return charToUnicodeMoreBytes.get(code);
    }

    /**
     * Returns the sequence of Unicode characters for the given character code.
     *
     * @param code bytes of the character code
     * @return Unicode characters (may be more than one, e.g "fi" ligature), or null if there is no mapping
     */
    public String toUnicode(byte[] code)
    {
        return toUnicode(toInt(code), code.length);
    }

    /**
     * Reads a character code from a string in the content stream.
     * <p>
     * The shortest possible code is read first and extended byte by byte until it fully matches one of the
     * codespace ranges. If no range matches, a warning is logged, the stream is repositioned behind the
     * initially read bytes (if the stream supports it) and the shortest code is returned.
     * <p>
     * See "CMap Mapping" and "Handling Undefined Characters" in PDF32000 for more details.
     *
     * @param in string stream
     * @return character code
     * @throws IOException if there was an error reading the stream or CMap
     */
    public int readCode(InputStream in) throws IOException
    {
        byte[] bytes = new byte[maxCodeLength];
        // a single read() call may deliver fewer bytes than requested, so keep reading until the
        // shortest code is complete or the stream has ended (missing bytes remain 0)
        int bytesRead = 0;
        while (bytesRead < minCodeLength)
        {
            int count = in.read(bytes, bytesRead, minCodeLength - bytesRead);
            if (count <= 0)
            {
                break;
            }
            bytesRead += count;
        }
        in.mark(maxCodeLength);
        for (int i = minCodeLength-1; i < maxCodeLength; i++)
        {
            final int byteCount = i + 1;
            if (codespaceRanges.stream().anyMatch(r -> r.isFullMatch(bytes, byteCount)))
            {
                return toInt(bytes, byteCount);
            }
            if (byteCount < maxCodeLength)
            {
                bytes[byteCount] = (byte)in.read();
            }
        }
        if (LOG.isWarnEnabled())
        {
            StringBuilder sb = new StringBuilder();
            for (int i = 0; i < maxCodeLength; ++i)
            {
                sb.append(String.format("0x%02X (%04o) ", bytes[i], bytes[i]));
            }
            LOG.warn("Invalid character code sequence {} in CMap {}", sb, cmapName);
        }
        // PDFBOX-4811 reposition to where we were after initial read
        if (in.markSupported())
        {
            in.reset();
        }
        else
        {
            // only the bytes consumed after the initial read are lost
            LOG.warn("mark() and reset() not supported, {} bytes have been skipped",
                    maxCodeLength - minCodeLength);
        }
        return toInt(bytes, minCodeLength); // Adobe Reader behavior
    }

    /**
     * Returns an int for the given byte array, the first byte being the most significant one.
     *
     * @param data the bytes to be converted
     * @return the bytes combined to a single int
     */
    static int toInt(byte[] data)
    {
        return toInt(data, data.length);
    }

    /**
     * Returns an int for the leading bytes of the given byte array, the first byte being the most
     * significant one.
     *
     * @param data the bytes to be converted
     * @param dataLen the number of leading bytes to be used
     * @return the bytes combined to a single int
     */
    private static int toInt(byte[] data, int dataLen)
    {
        int code = 0;
        for (int i = 0; i < dataLen; ++i)
        {
            code <<= 8;
            // mask to treat the byte as an unsigned value
            code |= (data[i] & 0xFF);
        }
        return code;
    }

    /**
     * Returns the CID for the given character code.
     *
     * @param code character code as byte array
     * @return CID, or 0 if there is no mapping for the given code
     */
    public int toCID(byte[] code)
    {
        if (!hasCIDMappings() || code.length < minCidLength || code.length > maxCidLength)
        {
            return 0;
        }
        Integer cid = null;
        if (codeToCid.containsKey(code.length))
        {
            cid = codeToCid.get(code.length).get(toInt(code));
        }
        if (cid == null)
        {
            // no single mapping, fall back to the ranges
            cid = toCIDFromRanges(code);
        }
        return cid;
    }

    /**
     * Returns the CID for the given character code.
     * 
     * This method exists for convenience. It may return false values as the origin byte length of the input value is
     * unknown and the mapping for some input values aren't unique. <br>
     * Example:<br>
     * The two byte value 0x00, 0x65 maps to 0x20 <br>
     * An input value of 0x65 always returns 0x20 even if the value has an origin byte length of 1.
     *
     * @param code character code
     * @return CID, or 0 if there is no mapping for the given code
     */
    public int toCID(int code)
    {
        if (!hasCIDMappings())
        {
            return 0;
        }
        int cid = 0;
        int length = minCidLength;
        // try every possible origin byte length, starting with the shortest one
        while (cid == 0 && (length <= maxCidLength))
        {
            cid = toCID(code, length++);
        }
        return cid;
    }

    /**
     * Returns the CID for the given character code.
     *
     * @param code   character code
     * @param length the origin byte length of the code
     * @return CID, or 0 if there is no mapping for the given code
     */
    public int toCID(int code, int length)
    {
        if (!hasCIDMappings() || length < minCidLength || length > maxCidLength)
        {
            return 0;
        }
        Integer cid = null;
        if (codeToCid.containsKey(length))
        {
            cid = codeToCid.get(length).get(code);
        }
        return cid != null ? cid : toCIDFromRanges(code, length);
    }

    /**
     * Returns the CID for the given character code using the CID ranges only.
     *
     * @param code character code
     * @param length the origin byte length of the code
     * @return CID of the first range providing a mapping, or 0 if there is none
     */
    private int toCIDFromRanges(int code, int length)
    {
        for (CIDRange range : codeToCidRanges)
        {
            int ch = range.map(code, length);
            if (ch != -1)
            {
                return ch;
            }
        }
        return 0;
    }

    /**
     * Returns the CID for the given character code using the CID ranges only.
     *
     * @param code character code as byte array
     * @return CID of the first range providing a mapping, or 0 if there is none
     */
    private int toCIDFromRanges(byte[] code)
    {
        for (CIDRange range : codeToCidRanges)
        {
            int ch = range.map(code);
            if (ch != -1)
            {
                return ch;
            }
        }
        return 0;
    }

    /**
     * This will add a character code to Unicode character sequence mapping.
     * <p>
     * Codes consisting of 1 to 4 bytes are supported, other lengths are logged and not added.
     *
     * @param codes The character codes to map from.
     * @param unicode The Unicode characters to map to.
     */
    void addCharMapping(byte[] codes, String unicode)
    {
        switch (codes.length)
        {
            case 1:
                charToUnicodeOneByte.put(CMapStrings.getIndexValue(codes), unicode);
                unicodeToByteCodes.put(unicode, CMapStrings.getByteValue(codes));
                break;
            case 2:
                charToUnicodeTwoBytes.put(CMapStrings.getIndexValue(codes), unicode);
                unicodeToByteCodes.put(unicode, CMapStrings.getByteValue(codes));
                break;
            case 3:
            case 4:
                charToUnicodeMoreBytes.put(toInt(codes), unicode);
                // store a copy so that later changes of the given array don't affect the mapping
                unicodeToByteCodes.put(unicode, codes.clone());
                break;
            default:
                LOG.warn("Mappings with more than 4 bytes (here: {}) aren't supported yet", codes.length);
                break;
        }
        // fixme: ugly little hack
        if (SPACE.equals(unicode))
        {
            spaceMapping = toInt(codes);
        }
    }

    /**
     * Get the code bytes for a Unicode string.
     *
     * @param unicode The Unicode string.
     * @return the code bytes or null if there is none.
     */
    public byte[] getCodesFromUnicode(String unicode)
    {
        return unicodeToByteCodes.get(unicode);
    }

    /**
     * This will add a CID mapping.
     *
     * @param code character code
     * @param cid CID
     */
    void addCIDMapping(byte[] code, int cid)
    {
        Map<Integer, Integer> codeToCidMap = codeToCid.get(code.length);
        if (codeToCidMap == null)
        {
            // first mapping for this code length, so the known length bounds may have to be widened
            codeToCidMap = new HashMap<>();
            codeToCid.put(code.length, codeToCidMap);
            minCidLength = Math.min(minCidLength, code.length);
            maxCidLength = Math.max(maxCidLength, code.length);
        }
        codeToCidMap.put(toInt(code), cid);
    }

    /**
     * This will add a CID Range.
     *
     * @param from starting character of the CID range.
     * @param to ending character of the CID range.
     * @param cid the cid to be started with.
     *
     */
    void addCIDRange(byte[] from, byte[] to, int cid)
    {
        addCIDRange(codeToCidRanges, toInt(from), toInt(to), cid, from.length);
    }

    /**
     * Adds a CID range to the given list. The most recently added range is extended if it accepts the new
     * values, otherwise a new range is appended.
     *
     * @param cidRanges the list of ranges to be updated
     * @param from starting character code of the CID range
     * @param to ending character code of the CID range
     * @param cid the cid to be started with
     * @param length the origin byte length of the character codes
     */
    private void addCIDRange(List<CIDRange> cidRanges, int from, int to, int cid, int length)
    {
        CIDRange lastRange = null;
        if (!cidRanges.isEmpty())
        {
            lastRange = cidRanges.get(cidRanges.size() - 1);
        }
        if (lastRange == null || !lastRange.extend(from, to, cid, length))
        {
            cidRanges.add(new CIDRange(from, to, cid, length));
            minCidLength = Math.min(minCidLength, length);
            maxCidLength = Math.max(maxCidLength, length);
        }
    }

    /**
     * This will add a codespace range.
     *
     * @param range A single codespace range.
     */
    void addCodespaceRange( CodespaceRange range )
    {
        codespaceRanges.add(range);
        maxCodeLength = Math.max(maxCodeLength, range.getCodeLength());
        minCodeLength = Math.min(minCodeLength, range.getCodeLength());
    }
    
    /**
     * Implementation of the usecmap operator.  This will
     * copy all of the mappings from one cmap to another.
     * 
     * @param cmap The cmap to load mappings from.
     */
    void useCmap(CMap cmap)
    {
        cmap.codespaceRanges.forEach(this::addCodespaceRange);
        charToUnicodeOneByte.putAll(cmap.charToUnicodeOneByte);
        charToUnicodeTwoBytes.putAll(cmap.charToUnicodeTwoBytes);
        charToUnicodeMoreBytes.putAll(cmap.charToUnicodeMoreBytes);
        // rebuild the inverted mappings from the integer keys; the low byte has to be masked, a
        // remainder by 0xFF would turn the code 0xFF into 0x00
        cmap.charToUnicodeOneByte.forEach((k, v) -> unicodeToByteCodes.put(v, new byte[]{(byte) (k & 0xFF)}));
        cmap.charToUnicodeTwoBytes.forEach((k, v) -> unicodeToByteCodes.put(v,
                new byte[]{(byte) ((k >>> 8) & 0xFF), (byte) (k & 0xFF)})
        );
        cmap.charToUnicodeMoreBytes.forEach((k, v) -> 
            {
                byte[] bar;
                // unsigned test of the top byte: a four byte code with the highest bit set is a
                // negative int and must not be treated as a three byte code
                if ((k >>> 24) == 0)
                {
                    // 3 bytes
                    bar = new byte[]{(byte) ((k >>> 16) & 0xFF), (byte) ((k >>> 8) & 0xFF), 
                        (byte) (k & 0xFF)};
                }
                else
                {
                    // 4 bytes
                    bar = new byte[]{(byte) ((k >>> 24) & 0xFF), (byte) ((k >>> 16) & 0xFF),
                        (byte) ((k >>> 8) & 0xFF), (byte) (k & 0xFF)};
                }
                unicodeToByteCodes.put(v, bar);
            });
        // copy the entries into maps owned by this instance; sharing the inner maps of the given
        // cmap would let later calls of addCIDMapping on this instance modify the given cmap as well
        cmap.codeToCid.forEach((key, value) ->
                codeToCid.computeIfAbsent(key, k -> new HashMap<>()).putAll(value));
        // NOTE(review): the CIDRange instances are shared with the given cmap and a later addCIDRange
        // call may extend the last one of them - confirm whether they need to be copied as well
        codeToCidRanges.addAll(cmap.codeToCidRanges);
        maxCodeLength = Math.max(maxCodeLength, cmap.maxCodeLength);
        minCodeLength = Math.min(minCodeLength, cmap.minCodeLength);
        maxCidLength = Math.max(maxCidLength, cmap.maxCidLength);
        minCidLength = Math.min(minCidLength, cmap.minCidLength);
    }

    /**
     * Returns the WMode of a CMap.
     *
     * 0 represents a horizontal and 1 represents a vertical orientation.
     * 
     * @return the wmode
     */
    public int getWMode() 
    {
        return wmode;
    }

    /**
     * Sets the WMode of a CMap.
     * 
     * @param newWMode the new WMode.
     */
    public void setWMode(int newWMode) 
    {
        wmode = newWMode;
    }

    /**
     * Returns the name of the CMap.
     * 
     * @return the CMap name.
     */
    public String getName() 
    {
        return cmapName;
    }

    /**
     * Sets the name of the CMap.
     * 
     * @param name the CMap name.
     */
    public void setName(String name) 
    {
        cmapName = name;
    }

    /**
     * Returns the version of the CMap.
     * 
     * @return the CMap version.
     */
    public String getVersion() 
    {
        return cmapVersion;
    }

    /**
     * Sets the version of the CMap.
     * 
     * @param version the CMap version.
     */
    public void setVersion(String version) 
    {
        cmapVersion = version;
    }

    /**
     * Returns the type of the CMap.
     * 
     * @return the CMap type.
     */
    public int getType() 
    {
        return cmapType;
    }

    /**
     * Sets the type of the CMap.
     * 
     * @param type the CMap type.
     */
    public void setType(int type) 
    {
        cmapType = type;
    }

    /**
     * Returns the registry of the CIDSystemInfo.
     * 
     * @return the registry.
     */
    public String getRegistry() 
    {
        return registry;
    }

    /**
     * Sets the registry of the CIDSystemInfo.
     * 
     * @param newRegistry the registry.
     */
    public void setRegistry(String newRegistry) 
    {
        registry = newRegistry;
    }

    /**
     * Returns the ordering of the CIDSystemInfo.
     * 
     * @return the ordering.
     */
    public String getOrdering() 
    {
        return ordering;
    }

    /**
     * Sets the ordering of the CIDSystemInfo.
     * 
     * @param newOrdering the ordering.
     */
    public void setOrdering(String newOrdering) 
    {
        ordering = newOrdering;
    }

    /**
     * Returns the supplement of the CIDSystemInfo.
     * 
     * @return the supplement.
     */
    public int getSupplement() 
    {
        return supplement;
    }

    /**
     * Sets the supplement of the CIDSystemInfo.
     * 
     * @param newSupplement the supplement.
     */
    public void setSupplement(int newSupplement) 
    {
        supplement = newSupplement;
    }
    
    /** 
     * Returns the mapping for the space character.
     * 
     * @return the mapped code for the space character, or -1 if no such mapping has been added
     */
    public int getSpaceMapping()
    {
        return spaceMapping;
    }

    /**
     * Returns the name of the CMap.
     *
     * @return the CMap name, which is null as long as no name has been set
     */
    @Override
    public String toString()
    {
        return cmapName;
    }
}