SourceCodeParser.java
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.parser.code;
import static org.codelibs.jhighlight.renderer.XhtmlRendererFactory.CPP;
import static org.codelibs.jhighlight.renderer.XhtmlRendererFactory.GROOVY;
import static org.codelibs.jhighlight.renderer.XhtmlRendererFactory.JAVA;
import java.io.IOException;
import java.nio.charset.Charset;
import java.util.Collections;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.xml.XMLConstants;
import org.codelibs.jhighlight.renderer.Renderer;
import org.codelibs.jhighlight.renderer.XhtmlRendererFactory;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Attribute;
import org.jsoup.nodes.DataNode;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Node;
import org.jsoup.nodes.TextNode;
import org.jsoup.select.NodeFilter;
import org.jsoup.select.NodeTraversor;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.AttributesImpl;
import org.apache.tika.config.TikaComponent;
import org.apache.tika.detect.AutoDetectReader;
import org.apache.tika.detect.EncodingDetector;
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AbstractEncodingDetectorParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.XHTMLContentHandler;
/**
* Generic Source code parser for Java, Groovy, C++.
* Note: this parser uses the JHighlight library (https://github.com/codelibs/jhighlight), which is dual-licensed under CDDL/LGPL.
*
* @author Hong-Thai.Nguyen
* @since 1.6
*/
@TikaComponent
public class SourceCodeParser extends AbstractEncodingDetectorParser {
private static final long serialVersionUID = -4543476498190054160L;
private static final Pattern AUTHORPATTERN = Pattern.compile("(?im)@author (.*) *$");
private static final Map<MediaType, String> TYPES_TO_RENDERER = new HashMap<MediaType, String>() {
private static final long serialVersionUID = -741976157563751152L;
{
put(MediaType.text("x-c++src"), CPP);
put(MediaType.text("x-java-source"), JAVA);
put(MediaType.text("x-groovy"), GROOVY);
}
};
public SourceCodeParser() {
super();
}
public SourceCodeParser(EncodingDetector encodingDetector) {
super(encodingDetector);
}
@Override
public Set<MediaType> getSupportedTypes(ParseContext context) {
return TYPES_TO_RENDERER.keySet();
}
@Override
public void parse(TikaInputStream tis, ContentHandler handler, Metadata metadata, ParseContext context)
throws IOException, SAXException, TikaException {
tis.setCloseShield();
try (AutoDetectReader reader = new AutoDetectReader(tis,
metadata, getEncodingDetector(context))) {
Charset charset = reader.getCharset();
String mediaType = metadata.get(Metadata.CONTENT_TYPE);
String name = metadata.get(TikaCoreProperties.RESOURCE_NAME_KEY);
MediaType type = null;
if (mediaType != null) {
type = MediaType.parse(mediaType);
metadata.set(Metadata.CONTENT_TYPE, type.toString());
metadata.set(Metadata.CONTENT_ENCODING, charset.name());
} else {
throw new TikaException("media type must be set in metadata before parse");
}
StringBuilder out = new StringBuilder();
String line;
int nbLines = 0;
while ((line = reader.readLine()) != null) {
out
.append(line)
.append(System.getProperty("line.separator"));
String author = parserAuthor(line);
if (author != null) {
metadata.add(TikaCoreProperties.CREATOR, author);
}
nbLines++;
}
metadata.set("LoC", String.valueOf(nbLines));
Renderer renderer = getRenderer(type.toString());
String codeAsHtml = renderer.highlight(name, out.toString(), charset.name(), false);
Document document = Jsoup.parse(codeAsHtml);
document.quirksMode(Document.QuirksMode.quirks);
XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata, context);
xhtml.startDocument();
try {
NodeTraversor.filter(new TikaNodeFilter(xhtml), document);
} catch (RuntimeSAXException e) {
throw e.getWrapped();
} finally {
xhtml.endDocument();
}
} finally {
tis.removeCloseShield();
}
}
private Renderer getRenderer(String mimeType) throws TikaException {
MediaType mt = MediaType.parse(mimeType);
String type = TYPES_TO_RENDERER.get(mt);
if (type == null) {
throw new TikaException("unparseable content type " + mimeType);
}
return XhtmlRendererFactory.getRenderer(type);
}
private String parserAuthor(String line) {
Matcher m = AUTHORPATTERN.matcher(line);
if (m.find()) {
return m
.group(1)
.trim();
}
return null;
}
private static class TikaNodeFilter implements NodeFilter {
boolean ignore = true;
ContentHandler handler;
private TikaNodeFilter(ContentHandler handler) {
this.handler = handler;
}
@Override
public NodeFilter.FilterResult head(Node node, int i) {
//skip document fragment
if ("html".equals(node.nodeName())) {
ignore = false;
}
if (ignore) {
return FilterResult.CONTINUE;
}
if (node instanceof TextNode) {
String txt = ((TextNode) node).getWholeText();
if (txt != null) {
char[] chars = txt.toCharArray();
try {
if (chars.length > 0) {
handler.characters(chars, 0, chars.length);
}
} catch (SAXException e) {
throw new RuntimeSAXException(e);
}
}
return NodeFilter.FilterResult.CONTINUE;
} else if (node instanceof DataNode) {
//maybe handle script data directly here instead of
//passing it through to the HTMLHandler?
String txt = ((DataNode) node).getWholeData();
if (txt != null) {
char[] chars = txt.toCharArray();
try {
if (chars.length > 0) {
handler.characters(chars, 0, chars.length);
}
} catch (SAXException e) {
throw new RuntimeSAXException(e);
}
}
return NodeFilter.FilterResult.CONTINUE;
}
AttributesImpl attributes = new AttributesImpl();
Iterator<Attribute> jsoupAttrs = node
.attributes()
.iterator();
while (jsoupAttrs.hasNext()) {
Attribute jsoupAttr = jsoupAttrs.next();
attributes.addAttribute("", jsoupAttr.getKey(), jsoupAttr.getKey(), "", jsoupAttr.getValue());
}
try {
handler.startElement("", node.nodeName(), node.nodeName(), attributes);
} catch (SAXException e) {
throw new RuntimeSAXException(e);
}
return NodeFilter.FilterResult.CONTINUE;
}
@Override
public NodeFilter.FilterResult tail(Node node, int i) {
if ("html".equals(node.nodeName())) {
ignore = true;
}
if (ignore) {
return FilterResult.CONTINUE;
}
if (node instanceof TextNode || node instanceof DataNode) {
return NodeFilter.FilterResult.CONTINUE;
}
try {
handler.endElement(XMLConstants.NULL_NS_URI, node.nodeName(), node.nodeName());
} catch (SAXException e) {
throw new RuntimeSAXException(e);
}
return NodeFilter.FilterResult.CONTINUE;
}
}
private static class RuntimeSAXException extends RuntimeException {
private SAXException wrapped;
private RuntimeSAXException(SAXException e) {
this.wrapped = e;
}
SAXException getWrapped() {
return wrapped;
}
}
}