ToMarkdownContentHandler.java
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.sax;
import java.io.IOException;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.StringWriter;
import java.io.UnsupportedEncodingException;
import java.io.Writer;
import java.util.ArrayDeque;
import java.util.ArrayList;
import java.util.Deque;
import java.util.List;
import java.util.Locale;
import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
/**
 * SAX event handler that writes content as Markdown.
 * <p>
 * Supports headings, paragraphs, bold, italic, links, images, lists (ordered
 * and unordered, including nested), tables (GFM pipe tables), code blocks,
 * inline code, blockquotes, horizontal rules, and definition lists.
 * </p>
 * <p>
 * Content within {@code <script>} and {@code <style>} tags is ignored.
 * </p>
 * <p>
 * Link text and table cells are buffered until their element closes. While a
 * buffer is active, all output (text and inline markup alike) is collected in
 * that buffer so that it ends up inside the link or cell it belongs to.
 * </p>
 *
 * @since Apache Tika 3.2
 */
public class ToMarkdownContentHandler extends DefaultHandler {

    private static final String STYLE = "style";
    private static final String SCRIPT = "script";

    private final Writer writer;
    private final Deque<ListState> listStack = new ArrayDeque<>();

    // Link buffering
    private StringBuilder linkText;
    private String linkHref;

    // Table buffering (only the outermost table is rendered; nested tables are ignored)
    private int tableDepth = 0;
    private List<List<String>> tableRows;
    private List<String> currentRow;
    private StringBuilder currentCell;

    // Blockquote
    private int blockquoteDepth = 0;

    // Code
    private boolean inPreBlock = false;
    private boolean inInlineCode = false;

    // Script/style suppression
    private int scriptDepth = 0;
    private int styleDepth = 0;

    // Spacing
    private boolean needsBlockSeparator = false;
    // True while nothing has been written to the writer yet, or its last character was '\n'
    private boolean atLineStart = true;

    // Track if we've written any content at all
    private boolean hasContent = false;
    // Track if meaningful (non-whitespace) content was written since last block separator
    private boolean hasContentSinceLastSeparator = false;

    /**
     * Creates a handler that writes Markdown to the given writer.
     *
     * @param writer destination of the Markdown output
     */
    public ToMarkdownContentHandler(Writer writer) {
        this.writer = writer;
    }

    /**
     * Creates a handler that writes Markdown to the given stream.
     *
     * @param stream   destination of the encoded Markdown output
     * @param encoding name of the character encoding to use
     * @throws UnsupportedEncodingException if the named encoding is not supported
     */
    public ToMarkdownContentHandler(OutputStream stream, String encoding)
            throws UnsupportedEncodingException {
        this(new OutputStreamWriter(stream, encoding));
    }

    /**
     * Creates a handler that collects the Markdown in memory; retrieve it with
     * {@link #toString()}.
     */
    public ToMarkdownContentHandler() {
        this(new StringWriter());
    }

    /**
     * Emits the opening Markdown for the element, or starts buffering for
     * links and table cells.
     *
     * @throws SAXException if writing to the underlying writer fails
     */
    @Override
    public void startElement(String uri, String localName, String qName, Attributes atts)
            throws SAXException {
        String name = localName(localName, qName);

        if (name.equals(SCRIPT)) {
            scriptDepth++;
            return;
        }
        if (name.equals(STYLE)) {
            styleDepth++;
            return;
        }
        if (isSuppressed()) {
            return;
        }

        switch (name) {
            case "h1":
            case "h2":
            case "h3":
            case "h4":
            case "h5":
            case "h6":
                emitBlockSeparator();
                // The digit after 'h' is the heading level.
                write(repeatChar('#', name.charAt(1) - '0') + " ");
                break;
            case "p":
            case "div":
                emitBlockSeparator();
                break;
            case "b":
            case "strong":
                write("**");
                break;
            case "i":
            case "em":
                write("*");
                break;
            case "a":
                linkHref = atts.getValue("href");
                linkText = new StringBuilder();
                break;
            case "img": {
                // getValue returns null for a missing attribute; never print "null".
                String alt = atts.getValue("alt");
                String src = atts.getValue("src");
                write("![" + (alt != null ? alt : "") + "](" + (src != null ? src : "") + ")");
                break;
            }
            case "ul":
            case "ol":
                // A nested list gets no extra block separator.
                if (listStack.isEmpty()) {
                    emitBlockSeparator();
                }
                listStack.push(new ListState(name.equals("ol"), listStack.size()));
                break;
            case "li":
                if (!listStack.isEmpty()) {
                    // An item of a nested list follows the parent item's text on the
                    // same line; every list marker must begin its own line.
                    if (!isAtLineStart()) {
                        write("\n");
                    }
                    ListState state = listStack.peek();
                    String indent = repeatChar(' ', state.depth * 4);
                    if (state.ordered) {
                        state.counter++;
                        write(indent + state.counter + ". ");
                    } else {
                        write(indent + "- ");
                    }
                }
                break;
            case "blockquote":
                emitBlockSeparator();
                blockquoteDepth++;
                break;
            case "pre":
                emitBlockSeparator();
                inPreBlock = true;
                write("```\n");
                break;
            case "code":
                // Inside <pre> the fence already marks the code; no backticks needed.
                if (!inPreBlock) {
                    inInlineCode = true;
                    write("`");
                }
                break;
            case "br":
                write("\n");
                break;
            case "hr":
                emitBlockSeparator();
                write("---");
                needsBlockSeparator = true;
                hasContent = true;
                break;
            case "table":
                tableDepth++;
                if (tableDepth == 1) {
                    emitBlockSeparator();
                    tableRows = new ArrayList<>();
                }
                break;
            case "tr":
                if (tableDepth == 1 && tableRows != null) {
                    currentRow = new ArrayList<>();
                }
                break;
            case "th":
            case "td":
                if (tableDepth == 1 && currentRow != null) {
                    currentCell = new StringBuilder();
                }
                break;
            case "dt":
                emitBlockSeparator();
                write("**");
                break;
            case "dd":
                write("\n: ");
                break;
            default:
                // Ignore structural elements like html, head, body, title, meta
                break;
        }
    }

    /**
     * Emits the closing Markdown for the element and flushes link, cell and
     * table buffers when their element ends.
     *
     * @throws SAXException if writing to the underlying writer fails
     */
    @Override
    public void endElement(String uri, String localName, String qName) throws SAXException {
        String name = localName(localName, qName);

        // Depth counters are clamped so an unbalanced end event cannot drive them negative.
        if (name.equals(SCRIPT)) {
            scriptDepth = Math.max(0, scriptDepth - 1);
            return;
        }
        if (name.equals(STYLE)) {
            styleDepth = Math.max(0, styleDepth - 1);
            return;
        }
        if (isSuppressed()) {
            return;
        }

        switch (name) {
            case "h1":
            case "h2":
            case "h3":
            case "h4":
            case "h5":
            case "h6":
            case "p":
            case "dd":
            case "div":
                needsBlockSeparator = true;
                hasContent = true;
                break;
            case "b":
            case "strong":
            case "dt":
                write("**");
                break;
            case "i":
            case "em":
                write("*");
                break;
            case "a":
                if (linkText != null) {
                    String text = linkText.toString();
                    String href = linkHref != null ? linkHref : "";
                    // Release the link buffer first so the finished link goes to the
                    // enclosing sink (a table cell or the writer), not into itself.
                    linkText = null;
                    linkHref = null;
                    write("[" + text + "](" + href + ")");
                }
                break;
            case "ul":
            case "ol":
                if (!listStack.isEmpty()) {
                    listStack.pop();
                }
                if (listStack.isEmpty()) {
                    needsBlockSeparator = true;
                    hasContent = true;
                }
                break;
            case "li":
                // A nested list may already have terminated the line.
                if (!isAtLineStart()) {
                    write("\n");
                }
                break;
            case "blockquote":
                blockquoteDepth = Math.max(0, blockquoteDepth - 1);
                needsBlockSeparator = true;
                hasContent = true;
                break;
            case "pre":
                // The closing fence must start on its own line.
                if (!isAtLineStart()) {
                    write("\n");
                }
                write("```");
                inPreBlock = false;
                needsBlockSeparator = true;
                hasContent = true;
                break;
            case "code":
                if (!inPreBlock) {
                    inInlineCode = false;
                    write("`");
                }
                break;
            case "table":
                if (tableDepth == 1) {
                    List<List<String>> rows = tableRows;
                    // Drop the buffers before emitting so the rendered rows are not
                    // appended to a cell that was left open.
                    tableRows = null;
                    currentRow = null;
                    currentCell = null;
                    emitTable(rows);
                    needsBlockSeparator = true;
                    hasContent = true;
                }
                tableDepth = Math.max(0, tableDepth - 1);
                break;
            case "tr":
                if (tableDepth == 1 && tableRows != null && currentRow != null) {
                    tableRows.add(currentRow);
                    currentRow = null;
                }
                break;
            case "th":
            case "td":
                if (tableDepth == 1 && currentRow != null && currentCell != null) {
                    currentRow.add(sanitizeCell(currentCell));
                    currentCell = null;
                }
                break;
            default:
                break;
        }
    }

    /**
     * Writes character data, escaping Markdown metacharacters in normal text.
     * Text inside a link or table cell is buffered unescaped; text inside
     * {@code <pre>} or inline code is written raw.
     *
     * @throws SAXException if writing to the underlying writer fails
     */
    @Override
    public void characters(char[] ch, int start, int length) throws SAXException {
        if (isSuppressed()) {
            return;
        }

        // Link text takes precedence over an enclosing table cell.
        StringBuilder buffer = activeBuffer();
        if (buffer != null) {
            buffer.append(ch, start, length);
            return;
        }

        String text = new String(ch, start, length);

        // Code is written raw (no escaping)
        if (inPreBlock || inInlineCode) {
            write(text);
            return;
        }

        // Skip whitespace-only text at line start; preserve inline spaces
        if (text.trim().isEmpty()) {
            if (!atLineStart) {
                write(" ");
            }
            return;
        }

        // Add blockquote prefix if needed at line start
        if (blockquoteDepth > 0 && atLineStart) {
            write(repeatChar('>', blockquoteDepth) + " ");
        }
        write(escapeMarkdown(text));
        hasContent = true;
        hasContentSinceLastSeparator = true;
    }

    /** Treats ignorable whitespace exactly like character data. */
    @Override
    public void ignorableWhitespace(char[] ch, int start, int length) throws SAXException {
        characters(ch, start, length);
    }

    /**
     * Flushes the underlying writer.
     *
     * @throws SAXException if flushing fails
     */
    @Override
    public void endDocument() throws SAXException {
        try {
            writer.flush();
        } catch (IOException e) {
            throw new SAXException("Error flushing character output", e);
        }
    }

    /**
     * Returns {@code toString()} of the underlying writer; for the default
     * {@link StringWriter} this is the Markdown written so far.
     */
    @Override
    public String toString() {
        return writer.toString();
    }

    /** Returns true while inside a script or style element. */
    private boolean isSuppressed() {
        return scriptDepth > 0 || styleDepth > 0;
    }

    /**
     * Returns the buffer currently collecting output (link text first, then
     * table cell), or null when output goes straight to the writer.
     */
    private StringBuilder activeBuffer() {
        return linkText != null ? linkText : currentCell;
    }

    /**
     * Returns true if the current sink is empty or ends with a newline. Uses
     * tracked state for the writer, so it works for any {@link Writer}.
     */
    private boolean isAtLineStart() {
        StringBuilder buffer = activeBuffer();
        if (buffer != null) {
            return buffer.length() == 0 || buffer.charAt(buffer.length() - 1) == '\n';
        }
        return atLineStart;
    }

    /**
     * Appends to the active link/cell buffer if there is one, otherwise writes
     * to the writer and updates the line-start and separator tracking.
     */
    private void write(String s) throws SAXException {
        if (s.isEmpty()) {
            return;
        }
        StringBuilder buffer = activeBuffer();
        if (buffer != null) {
            buffer.append(s);
            return;
        }
        try {
            writer.write(s);
        } catch (IOException e) {
            throw new SAXException("Error writing: " + s, e);
        }
        atLineStart = s.charAt(s.length() - 1) == '\n';
        if (!s.trim().isEmpty()) {
            hasContentSinceLastSeparator = true;
        }
    }

    /**
     * Consumes a pending block separator. In the main output this is a blank
     * line, written only if content precedes it; inside a link or cell buffer
     * blocks are separated by a single space instead.
     */
    private void emitBlockSeparator() throws SAXException {
        boolean pending = needsBlockSeparator;
        needsBlockSeparator = false;
        if (!pending) {
            return;
        }
        StringBuilder buffer = activeBuffer();
        if (buffer != null) {
            if (buffer.length() > 0) {
                buffer.append(' ');
            }
            return;
        }
        if (hasContent && hasContentSinceLastSeparator) {
            write("\n\n");
            hasContentSinceLastSeparator = false;
        }
    }

    /**
     * Writes the buffered rows as a GFM pipe table. The first row is treated
     * as the header and short rows are padded with empty cells.
     */
    private void emitTable(List<List<String>> rows) throws SAXException {
        if (rows == null || rows.isEmpty()) {
            return;
        }
        int cols = 0;
        for (List<String> row : rows) {
            cols = Math.max(cols, row.size());
        }
        if (cols == 0) {
            // Rows without any cells would only produce stray pipes.
            return;
        }

        boolean header = true;
        for (List<String> row : rows) {
            StringBuilder line = new StringBuilder("|");
            for (int c = 0; c < cols; c++) {
                String cell = c < row.size() ? row.get(c) : "";
                line.append(" ").append(cell).append(" |");
            }
            write(line.toString());
            write("\n");
            if (header) {
                // GFM requires a delimiter row right after the header row.
                StringBuilder sep = new StringBuilder("|");
                for (int c = 0; c < cols; c++) {
                    sep.append(" --- |");
                }
                write(sep.toString());
                write("\n");
                header = false;
            }
        }
    }

    /**
     * Makes buffered cell content safe for a single pipe-table row: line
     * breaks become spaces, pipes are backslash-escaped, and the result is
     * trimmed.
     */
    private static String sanitizeCell(CharSequence raw) {
        StringBuilder sb = new StringBuilder(raw.length());
        for (int i = 0; i < raw.length(); i++) {
            char c = raw.charAt(i);
            switch (c) {
                case '\r':
                case '\n':
                    sb.append(' ');
                    break;
                case '|':
                    sb.append("\\|");
                    break;
                default:
                    sb.append(c);
                    break;
            }
        }
        return sb.toString().trim();
    }

    /** Backslash-escapes the characters that have special meaning in Markdown. */
    private static String escapeMarkdown(String text) {
        StringBuilder sb = new StringBuilder(text.length());
        for (int i = 0; i < text.length(); i++) {
            char c = text.charAt(i);
            switch (c) {
                case '\\':
                case '`':
                case '*':
                case '_':
                case '[':
                case ']':
                case '#':
                case '|':
                    sb.append('\\').append(c);
                    break;
                default:
                    sb.append(c);
                    break;
            }
        }
        return sb.toString();
    }

    /** Returns a string consisting of {@code count} copies of {@code c}. */
    private static String repeatChar(char c, int count) {
        StringBuilder sb = new StringBuilder(Math.max(0, count));
        for (int i = 0; i < count; i++) {
            sb.append(c);
        }
        return sb.toString();
    }

    /**
     * Returns the lower-cased element name: the local name if present,
     * otherwise the qualified name without its prefix, otherwise "".
     */
    private static String localName(String localName, String qName) {
        if (localName != null && !localName.isEmpty()) {
            return localName.toLowerCase(Locale.ROOT);
        }
        if (qName != null) {
            // Strip namespace prefix
            int colon = qName.indexOf(':');
            String name = colon >= 0 ? qName.substring(colon + 1) : qName;
            return name.toLowerCase(Locale.ROOT);
        }
        return "";
    }

    /** State of one open list: its kind, nesting depth, and item counter. */
    private static class ListState {
        final boolean ordered;
        final int depth;
        int counter;

        ListState(boolean ordered, int depth) {
            this.ordered = ordered;
            this.depth = depth;
            this.counter = 0;
        }
    }
}