DWGReadFormatRemover.java
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.parser.dwg;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Removes the inline formatting codes from text extracted from libredwg output so that
 * only the raw text remains.
 * <p>
 * The codes that need to be cleaned were taken from the following pages:
 * <p>
 * <a href="https://www.cadforum.cz/en/text-formatting-codes-in-mtext-objects-tip8640">
 * https://www.cadforum.cz/en/text-formatting-codes-in-mtext-objects-tip8640</a>
 * <p>
 * <a href="https://adndevblog.typepad.com/autocad/2017/09/dissecting-mtext-format-codes.html">
 * https://adndevblog.typepad.com/autocad/2017/09/dissecting-mtext-format-codes.html</a>
 */
public class DWGReadFormatRemover {

    // Every pattern below also matches runs of escaped backslashes (two backslash
    // characters). Matching them consumes the run, so the backslash that ends an
    // escaped pair is never mistaken for the start of a formatting code.

    /** Escaped-backslash run, or a start/stop underline/overstrike/strike-through code. */
    private static final Pattern UNDERLINE_STRIKE_THROUGH =
            Pattern.compile("((?:\\\\\\\\)+|\\\\[LlOoKk])");

    /** Escaped-backslash run, or a code that carries an argument terminated by a semicolon. */
    private static final Pattern END_MARKS = Pattern.compile(
            "((?:\\\\\\\\)+|\\\\(?:A|H|pi|pxt|pxi|pt|X|Q|f|F|W|C|T)[^;]{0,100};)");

    /** Escaped-backslash run, or the {@code \P} new line marker. */
    private static final Pattern NEW_LINE = Pattern.compile("((?:\\\\\\\\)+|\\\\P)");

    /**
     * Escaped-backslash run (group 1), or a {@code \S} stacked fraction whose two parts
     * (groups 2 and 3) are separated by {@code /}, {@code ^} or {@code #}.
     */
    private static final Pattern STACK_FRAC =
            Pattern.compile("(\\\\\\\\)+|\\\\S([^/^#]{1,20})[/^#]([^;]{1,20});");

    /** A curly brace preceded by backslashes (group 1), or a bare curly brace (group 2). */
    private static final Pattern CURLY_BRACES = Pattern.compile("(\\\\)+[{}]|([{}])");

    /** A single backslash that is neither preceded nor followed by another backslash. */
    private static final Pattern ESCAPE_CHARS = Pattern.compile("(?<!\\\\)(\\\\)(?!\\\\)");

    /** Two consecutive backslashes, i.e. an escaped backslash. */
    private static final Pattern ESCAPED_BACKSLASH = Pattern.compile("(\\\\\\\\)");

    /**
     * Strips the formatting codes from a DWG text string.
     * <p>
     * The passes run in a fixed order: underline/overstrike/strike-through toggles are
     * removed, semicolon-terminated codes are removed, {@code \P} becomes a line feed,
     * stacked fractions are rewritten as {@code numerator/denominator}, unescaped curly
     * braces are dropped, the remaining single escape backslashes are removed and finally
     * escaped backslashes are collapsed to one backslash.
     *
     * @param dwgString the text to clean; may be {@code null}
     * @return the text without formatting codes, or {@code null} if {@code dwgString} was
     *         {@code null}
     */
    public String cleanupDwgString(String dwgString) {
        if (dwgString == null) {
            return null;
        }
        //Strip off start/stop underline/overstrike/strike throughs
        String cleanString = replaceCodes(UNDERLINE_STRIKE_THROUGH, dwgString, "");
        //Strip off semi-colon ended markers
        cleanString = replaceCodes(END_MARKS, cleanString, "");
        //new line marker \\P replace with actual new line
        cleanString = replaceCodes(NEW_LINE, cleanString, "\n");
        //stacking fractions
        cleanString = flattenStackedFractions(cleanString);
        //strip brackets around text, make sure they aren't escaped
        cleanString = stripUnescapedBraces(cleanString);
        //now get rid of escape characters
        cleanString = ESCAPE_CHARS.matcher(cleanString).replaceAll("");
        //now unescape backslash
        return ESCAPED_BACKSLASH.matcher(cleanString).replaceAll("\\\\");
    }

    /**
     * Replaces every match of {@code pattern} whose group 1 is a formatting code, leaving
     * matches that are runs of escaped backslashes (group 1 ends with a backslash) as is.
     */
    private static String replaceCodes(Pattern pattern, String text, String replacement) {
        Matcher m = pattern.matcher(text);
        StringBuilder sb = new StringBuilder(text.length());
        String literal = Matcher.quoteReplacement(replacement);
        while (m.find()) {
            // Skipped matches are not lost: appendTail/appendReplacement copy them later.
            if (!m.group(1).endsWith("\\")) {
                m.appendReplacement(sb, literal);
            }
        }
        m.appendTail(sb);
        return sb.toString();
    }

    /** Rewrites each stacked fraction code as its two parts joined by a slash. */
    private static String flattenStackedFractions(String text) {
        Matcher m = STACK_FRAC.matcher(text);
        StringBuilder sb = new StringBuilder(text.length());
        while (m.find()) {
            // group 1 is only set for an escaped-backslash run, which must be kept
            if (m.group(1) == null) {
                // The captured parts are document text, not a replacement template:
                // quote them so '$' and '\' are copied literally instead of being
                // interpreted (or rejected) by appendReplacement.
                m.appendReplacement(sb,
                        Matcher.quoteReplacement(m.group(2) + "/" + m.group(3)));
            }
        }
        m.appendTail(sb);
        return sb.toString();
    }

    /** Removes curly braces that are not preceded by a backslash. */
    private static String stripUnescapedBraces(String text) {
        Matcher m = CURLY_BRACES.matcher(text);
        StringBuilder sb = new StringBuilder(text.length());
        while (m.find()) {
            // group 1 is only set when the brace was preceded by backslashes
            if (m.group(1) == null) {
                m.appendReplacement(sb, "");
            }
        }
        m.appendTail(sb);
        return sb.toString();
    }
}