MarkdownToXHTMLEmitter.java
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.parser.vlm;
import java.util.Arrays;
import java.util.List;
import org.commonmark.Extension;
import org.commonmark.ext.gfm.strikethrough.Strikethrough;
import org.commonmark.ext.gfm.strikethrough.StrikethroughExtension;
import org.commonmark.ext.gfm.tables.TableBlock;
import org.commonmark.ext.gfm.tables.TableBody;
import org.commonmark.ext.gfm.tables.TableCell;
import org.commonmark.ext.gfm.tables.TableHead;
import org.commonmark.ext.gfm.tables.TableRow;
import org.commonmark.ext.gfm.tables.TablesExtension;
import org.commonmark.node.AbstractVisitor;
import org.commonmark.node.BlockQuote;
import org.commonmark.node.BulletList;
import org.commonmark.node.Code;
import org.commonmark.node.Document;
import org.commonmark.node.Emphasis;
import org.commonmark.node.FencedCodeBlock;
import org.commonmark.node.HardLineBreak;
import org.commonmark.node.Heading;
import org.commonmark.node.HtmlBlock;
import org.commonmark.node.HtmlInline;
import org.commonmark.node.Image;
import org.commonmark.node.IndentedCodeBlock;
import org.commonmark.node.Link;
import org.commonmark.node.ListItem;
import org.commonmark.node.Node;
import org.commonmark.node.OrderedList;
import org.commonmark.node.Paragraph;
import org.commonmark.node.SoftLineBreak;
import org.commonmark.node.StrongEmphasis;
import org.commonmark.node.Text;
import org.commonmark.node.ThematicBreak;
import org.commonmark.parser.Parser;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.AttributesImpl;
/**
* Parses a markdown string using commonmark-java and emits XHTML SAX events.
* <p>
* Supports:
* <ul>
* <li>Headings ({@code h1}���{@code h6})</li>
* <li>Paragraphs ({@code p})</li>
* <li>Bold / italic / strikethrough ({@code b}, {@code i}, {@code s})</li>
* <li>Links ({@code a}) and images ({@code img})</li>
* <li>Ordered and unordered lists ({@code ol}, {@code ul}, {@code li})</li>
* <li>Blockquotes ({@code blockquote})</li>
* <li>Code blocks ({@code pre}/{@code code}) and inline code ({@code code})</li>
* <li>GFM tables ({@code table}, {@code thead}, {@code tbody}, {@code tr},
* {@code th}, {@code td})</li>
* <li>Thematic breaks ({@code hr})</li>
* <li>Hard / soft line breaks ({@code br})</li>
* </ul>
*
* @since Apache Tika 4.0
*/
class MarkdownToXHTMLEmitter {
private static final List<Extension> EXTENSIONS = Arrays.asList(
TablesExtension.create(),
StrikethroughExtension.create()
);
private static final Parser PARSER = Parser.builder()
.extensions(EXTENSIONS)
.build();
private static final AttributesImpl EMPTY_ATTRS = new AttributesImpl();
/**
* Parses the given markdown text and emits SAX events to the handler.
* <p>
* The caller is responsible for calling {@code startDocument} /
* {@code endDocument} on the handler if desired ��� this method only emits
* the body-level elements.
*
* @param markdown the markdown text to parse
* @param handler the SAX content handler to receive events
* @throws SAXException if the handler throws
*/
static void emit(String markdown, ContentHandler handler) throws SAXException {
if (markdown == null || markdown.isEmpty()) {
return;
}
Node document = PARSER.parse(markdown);
SAXVisitor visitor = new SAXVisitor(handler);
document.accept(visitor);
if (visitor.saxException != null) {
throw visitor.saxException;
}
}
/**
* commonmark AST visitor that fires SAX events for each node.
*/
private static class SAXVisitor extends AbstractVisitor {
private final ContentHandler handler;
SAXException saxException;
SAXVisitor(ContentHandler handler) {
this.handler = handler;
}
// --- block nodes ---
@Override
public void visit(Document document) {
visitChildren(document);
}
@Override
public void visit(Heading heading) {
String tag = "h" + heading.getLevel();
startElement(tag);
visitChildren(heading);
endElement(tag);
}
@Override
public void visit(Paragraph paragraph) {
// Skip wrapping <p> inside list items ��� commonmark wraps
// "loose" list item content in Paragraph nodes, which would
// produce <li><p>text</p></li>. We emit the text directly.
if (paragraph.getParent() instanceof ListItem) {
visitChildren(paragraph);
return;
}
startElement("p");
visitChildren(paragraph);
endElement("p");
}
@Override
public void visit(BlockQuote blockQuote) {
startElement("blockquote");
visitChildren(blockQuote);
endElement("blockquote");
}
@Override
public void visit(BulletList bulletList) {
startElement("ul");
visitChildren(bulletList);
endElement("ul");
}
@Override
public void visit(OrderedList orderedList) {
startElement("ol");
visitChildren(orderedList);
endElement("ol");
}
@Override
public void visit(ListItem listItem) {
startElement("li");
visitChildren(listItem);
endElement("li");
}
@Override
public void visit(FencedCodeBlock fencedCodeBlock) {
AttributesImpl attrs = EMPTY_ATTRS;
String info = fencedCodeBlock.getInfo();
if (info != null && !info.isEmpty()) {
attrs = new AttributesImpl();
attrs.addAttribute("", "class", "class", "CDATA",
"language-" + info.split("\\s+")[0]);
}
startElement("pre");
startElement("code", attrs);
characters(fencedCodeBlock.getLiteral());
endElement("code");
endElement("pre");
}
@Override
public void visit(IndentedCodeBlock indentedCodeBlock) {
startElement("pre");
startElement("code");
characters(indentedCodeBlock.getLiteral());
endElement("code");
endElement("pre");
}
@Override
public void visit(ThematicBreak thematicBreak) {
emptyElement("hr");
}
@Override
public void visit(HtmlBlock htmlBlock) {
// Emit raw HTML content as plain text ��� we don't parse nested HTML
characters(htmlBlock.getLiteral());
}
// --- inline nodes ---
@Override
public void visit(Text text) {
characters(text.getLiteral());
}
@Override
public void visit(StrongEmphasis strongEmphasis) {
startElement("b");
visitChildren(strongEmphasis);
endElement("b");
}
@Override
public void visit(Emphasis emphasis) {
startElement("i");
visitChildren(emphasis);
endElement("i");
}
@Override
public void visit(Code code) {
startElement("code");
characters(code.getLiteral());
endElement("code");
}
@Override
public void visit(Link link) {
AttributesImpl attrs = new AttributesImpl();
attrs.addAttribute("", "href", "href", "CDATA", link.getDestination());
if (link.getTitle() != null && !link.getTitle().isEmpty()) {
attrs.addAttribute("", "title", "title", "CDATA", link.getTitle());
}
startElement("a", attrs);
visitChildren(link);
endElement("a");
}
@Override
public void visit(Image image) {
AttributesImpl attrs = new AttributesImpl();
attrs.addAttribute("", "src", "src", "CDATA", image.getDestination());
if (image.getTitle() != null && !image.getTitle().isEmpty()) {
attrs.addAttribute("", "title", "title", "CDATA", image.getTitle());
}
// Use alt text from child text nodes
StringBuilder alt = new StringBuilder();
Node child = image.getFirstChild();
while (child != null) {
if (child instanceof Text) {
alt.append(((Text) child).getLiteral());
}
child = child.getNext();
}
attrs.addAttribute("", "alt", "alt", "CDATA", alt.toString());
emptyElement("img", attrs);
}
@Override
public void visit(HardLineBreak hardLineBreak) {
emptyElement("br");
}
@Override
public void visit(SoftLineBreak softLineBreak) {
characters(" ");
}
@Override
public void visit(HtmlInline htmlInline) {
// Emit inline HTML as plain text
characters(htmlInline.getLiteral());
}
// --- GFM extensions ---
@Override
public void visit(org.commonmark.node.CustomBlock customBlock) {
if (customBlock instanceof TableBlock) {
startElement("table");
visitChildren(customBlock);
endElement("table");
} else {
visitChildren(customBlock);
}
}
@Override
public void visit(org.commonmark.node.CustomNode customNode) {
if (customNode instanceof TableHead) {
startElement("thead");
visitChildren(customNode);
endElement("thead");
} else if (customNode instanceof TableBody) {
startElement("tbody");
visitChildren(customNode);
endElement("tbody");
} else if (customNode instanceof TableRow) {
startElement("tr");
visitChildren(customNode);
endElement("tr");
} else if (customNode instanceof TableCell) {
TableCell cell = (TableCell) customNode;
String tag = cell.isHeader() ? "th" : "td";
AttributesImpl attrs = EMPTY_ATTRS;
TableCell.Alignment alignment = cell.getAlignment();
if (alignment != null) {
attrs = new AttributesImpl();
String align;
switch (alignment) {
case LEFT:
align = "left";
break;
case CENTER:
align = "center";
break;
case RIGHT:
align = "right";
break;
default:
align = null;
break;
}
if (align != null) {
attrs.addAttribute("", "align", "align", "CDATA", align);
}
}
startElement(tag, attrs);
visitChildren(customNode);
endElement(tag);
} else if (customNode instanceof Strikethrough) {
startElement("s");
visitChildren(customNode);
endElement("s");
} else {
visitChildren(customNode);
}
}
// --- SAX helpers ---
private void startElement(String localName) {
startElement(localName, EMPTY_ATTRS);
}
private void startElement(String localName, AttributesImpl attrs) {
if (saxException != null) {
return;
}
try {
handler.startElement("", localName, localName, attrs);
} catch (SAXException e) {
saxException = e;
}
}
private void endElement(String localName) {
if (saxException != null) {
return;
}
try {
handler.endElement("", localName, localName);
} catch (SAXException e) {
saxException = e;
}
}
private void emptyElement(String localName) {
emptyElement(localName, EMPTY_ATTRS);
}
private void emptyElement(String localName, AttributesImpl attrs) {
startElement(localName, attrs);
endElement(localName);
}
private void characters(String text) {
if (saxException != null || text == null || text.isEmpty()) {
return;
}
try {
char[] chars = text.toCharArray();
handler.characters(chars, 0, chars.length);
} catch (SAXException e) {
saxException = e;
}
}
}
}