CompoundCharacterTokenizer.java
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.fontbox.ttf.gsub;
import java.util.ArrayList;
import java.util.List;
import java.util.Set;
import java.util.StringJoiner;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
* Takes in the given text having compound-glyphs to substitute, and splits it into chunks consisting of parts that
* should be substituted and the ones that can be processed normally.
*
* @author Palash Ray
*
*/
public class CompoundCharacterTokenizer
{
private static final String GLYPH_ID_SEPARATOR = "_";
private final Pattern regexExpression;
/**
* Constructor. Calls getRegexFromTokens which returns strings like
* (_79_99_)|(_80_99_)|(_92_99_) and creates a regexp assigned to regexExpression. See the code
* in GlyphArraySplitterRegexImpl on how these strings were created.
* <p>
* It is assumed the compound words are sorted in descending order of length.
*
* @param compoundWords A set of strings like _79_99_, _80_99_ or _92_99_ .
*/
public CompoundCharacterTokenizer(Set<String> compoundWords)
{
validateCompoundWords(compoundWords);
regexExpression = Pattern.compile(getRegexFromTokens(compoundWords));
}
public CompoundCharacterTokenizer(Pattern pattern)
{
regexExpression = pattern;
}
/**
* Validate the compound words. They should not be null or empty and should start and end with
* the GLYPH_ID_SEPARATOR
*/
private void validateCompoundWords(Set<String> compoundWords)
{
if (compoundWords == null || compoundWords.isEmpty())
{
throw new IllegalArgumentException("Compound words cannot be null or empty");
}
// Ensure all word are starting and ending with the GLYPH_ID_SEPARATOR
compoundWords.forEach(word ->
{
if (!word.startsWith(GLYPH_ID_SEPARATOR) || !word.endsWith(GLYPH_ID_SEPARATOR))
{
throw new IllegalArgumentException(
"Compound words should start and end with " + GLYPH_ID_SEPARATOR);
}
});
}
/**
* Tokenize a string into tokens.
*
* @param text A string like "_66_71_71_74_79_70_"
* @return A list of tokens like "_66_", "_71_71_", "74_79_70_". The "_" is sometimes missing at
* the beginning or end, this has to be cleaned by the caller.
*/
public List<String> tokenize(String text)
{
List<String> tokens = new ArrayList<>();
Matcher regexMatcher = regexExpression.matcher(text);
int lastIndexOfPrevMatch = 0;
while (regexMatcher.find(lastIndexOfPrevMatch)) // this is where the magic happens:
// the regexp is used to find a matching pattern for substitution
{
int beginIndexOfNextMatch = regexMatcher.start();
String prevToken = text.substring(lastIndexOfPrevMatch, beginIndexOfNextMatch);
if (!prevToken.isEmpty())
{
tokens.add(prevToken);
}
String currentMatch = regexMatcher.group();
tokens.add(currentMatch);
lastIndexOfPrevMatch = regexMatcher.end();
if (lastIndexOfPrevMatch < text.length() && text.charAt(lastIndexOfPrevMatch) != '_')
{
// beause it is sometimes positioned after the "_", but it should be positioned
// before the "_"
--lastIndexOfPrevMatch;
}
}
String tail = text.substring(lastIndexOfPrevMatch);
if (!tail.isEmpty())
{
tokens.add(tail);
}
return tokens;
}
private String getRegexFromTokens(Set<String> compoundWords)
{
StringJoiner sj = new StringJoiner(")|(", "(", ")");
compoundWords.forEach(sj::add);
return sj.toString();
}
}