MarkdownChunkerTest.java
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.inference;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertThrows;
import static org.junit.jupiter.api.Assertions.assertTrue;
import java.util.List;
import org.junit.jupiter.api.Test;
public class MarkdownChunkerTest {
@Test
void testSimpleHeadingSplit() {
String md = "# Heading 1\n\nParagraph one.\n\n# Heading 2\n\nParagraph two.";
MarkdownChunker chunker = new MarkdownChunker(500, 0);
List<Chunk> chunks = chunker.chunk(md);
assertEquals(2, chunks.size());
assertTrue(chunks.get(0).getText().startsWith("# Heading 1"));
assertTrue(chunks.get(1).getText().startsWith("# Heading 2"));
}
@Test
void testOffsets() {
String md = "# A\n\nText A.\n\n# B\n\nText B.";
MarkdownChunker chunker = new MarkdownChunker(500, 0);
List<Chunk> chunks = chunker.chunk(md);
assertEquals(2, chunks.size());
// Verify offsets point into the original string
assertEquals(chunks.get(0).getText(),
md.substring(chunks.get(0).getStartOffset(),
chunks.get(0).getEndOffset()));
assertEquals(chunks.get(1).getText(),
md.substring(chunks.get(1).getStartOffset(),
chunks.get(1).getEndOffset()));
}
@Test
void testParagraphSplitWhenSectionTooLarge() {
// Two paragraphs in one heading section, each ~30 chars
String md = "# Big Section\n\n"
+ "Paragraph one is here now.\n\n"
+ "Paragraph two is here too.";
// Max 50 chars forces a paragraph-level split
MarkdownChunker chunker = new MarkdownChunker(50, 0);
List<Chunk> chunks = chunker.chunk(md);
assertTrue(chunks.size() >= 2,
"Should split at paragraph boundary, got " + chunks.size());
}
@Test
void testHardSplitOnLongParagraph() {
String longPara = "A".repeat(200);
MarkdownChunker chunker = new MarkdownChunker(50, 0);
List<Chunk> chunks = chunker.chunk(longPara);
assertTrue(chunks.size() >= 4,
"200 chars / 50 max = at least 4 chunks");
for (Chunk c : chunks) {
assertTrue(c.getText().length() <= 50);
}
}
@Test
void testOverlap() {
String md = "# A\n\nAAAAAAAAAA\n\n# B\n\nBBBBBBBBBB";
MarkdownChunker chunker = new MarkdownChunker(500, 5);
List<Chunk> chunks = chunker.chunk(md);
assertEquals(2, chunks.size());
// Second chunk should start earlier than the heading boundary
// due to overlap pulling back into the first chunk's text
assertTrue(chunks.get(1).getStartOffset() < md.indexOf("# B"));
}
@Test
void testEmptyInput() {
MarkdownChunker chunker = new MarkdownChunker(500, 0);
assertEquals(0, chunker.chunk("").size());
assertEquals(0, chunker.chunk(null).size());
}
@Test
void testNoHeadings() {
String md = "Just a plain paragraph with no headings at all.";
MarkdownChunker chunker = new MarkdownChunker(500, 0);
List<Chunk> chunks = chunker.chunk(md);
assertEquals(1, chunks.size());
assertEquals(md, chunks.get(0).getText());
}
@Test
void testMultipleLevelHeadings() {
String md = "# H1\n\nText.\n\n## H2\n\nMore text.\n\n### H3\n\nDeep text.";
MarkdownChunker chunker = new MarkdownChunker(500, 0);
List<Chunk> chunks = chunker.chunk(md);
assertEquals(3, chunks.size());
}
@Test
void testInvalidConfig() {
assertThrows(IllegalArgumentException.class,
() -> new MarkdownChunker(0, 0));
assertThrows(IllegalArgumentException.class,
() -> new MarkdownChunker(100, -1));
assertThrows(IllegalArgumentException.class,
() -> new MarkdownChunker(100, 100));
}
@Test
void testSingleSmallChunk() {
String md = "# Title\n\nShort.";
MarkdownChunker chunker = new MarkdownChunker(500, 0);
List<Chunk> chunks = chunker.chunk(md);
assertEquals(1, chunks.size());
assertEquals(0, chunks.get(0).getStartOffset());
}
}