HtmlEncodingDetectorTest.java

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.parser.html;


import static org.junit.jupiter.api.Assertions.assertEquals;

import java.io.IOException;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.List;

import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;

import org.apache.tika.detect.EncodingResult;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;

/**
 * Tests for {@link HtmlEncodingDetector}.
 *
 * <p>Each test hands a short HTML snippet to the detector and checks the charset of the
 * first result it returns. Most snippets pair a misleading or malformed declaration with a
 * valid {@code <meta charset='WINDOWS-1252'>}, so a passing test means the detector skipped
 * the bogus declaration. Cases the detector is not expected to handle yet are kept but
 * marked {@link Disabled} with the reason.
 */
public class HtmlEncodingDetectorTest {

    /** A plain {@code <meta charset>} declaration is honoured. */
    @Test
    public void basic() throws IOException {
        assertWindows1252("<meta charset='WINDOWS-1252'>");
    }

    @Test
    @Disabled("until we can prove this harms detection")
    public void utf16() throws IOException {
        // According to the specification 'If charset is a UTF-16 encoding,
        // then set charset to UTF-8.'
        assertCharset("<meta charset='UTF-16BE'>", StandardCharsets.UTF_8);
    }

    @Test
    public void xUserDefined() throws IOException {
        // According to the specification 'If charset is x-user-defined,
        // then set charset to windows-1252.'
        assertWindows1252("<meta charset='x-user-defined'>");
    }

    /** A slash directly after the tag name must not hide the charset attribute. */
    @Test
    public void withSlash() throws IOException {
        assertWindows1252("<meta/charset='WINDOWS-1252'>");
    }

    /** A {@code charset=} string inside an unrelated attribute value is not a declaration. */
    @Test
    @Disabled("until we do a full parse")
    public void insideTag() throws IOException {
        assertWindows1252("<meta name='description'" +
                "content='If I write charset=UTF-8 here, it doesnt mean the page is in UTF-8'/>" +
                "<meta charset='WINDOWS-1252'>");
    }

    @Test
    @Disabled("until we do a full parse")
    public void missingAttribute() throws IOException {
        assertWindows1252("<meta content='charset=UTF-8'>" + // missing http-equiv attribute
                "<meta charset='WINDOWS-1252'>" // valid declaration
        );
    }

    @Test
    @Disabled("until we do a full parse")
    public void insideSpecialTag() throws IOException {
        // Content inside <?, <!, and </ should be ignored
        for (byte b : "?!/".getBytes(StandardCharsets.US_ASCII)) {
            assertWindows1252("<" + (char) b + // opens the special tag: <?, <! or </
                    "<meta charset='UTF-8'>" + // inside special tag
                    "<meta charset='WINDOWS-1252'>" // real charset declaration
            );
        }
    }

    @Test
    @Disabled("until we can prove this harms detection")
    public void spaceBeforeTag() throws IOException {
        assertWindows1252("< meta charset='UTF-8'>" + // invalid charset declaration
                "<meta charset='WINDOWS-1252'>" // real charset declaration
        );
    }

    @Test
    public void invalidAttribute() throws IOException {
        assertWindows1252("<meta " + "badcharset='UTF-8' " + // invalid charset declaration
                "charset='WINDOWS-1252'>" // real charset declaration
        );
    }

    @Test
    @Disabled("until we can prove this harms detection")
    public void unmatchedQuote() throws IOException {
        assertWindows1252("<meta http-equiv='content-type' content='charset=\"UTF-8'>" +
                // invalid charset declaration
                "<meta charset='WINDOWS-1252'>" // real charset declaration
        );
    }


    @Test
    @Disabled("until we do a full parse")
    public void withCompactComment() throws IOException {
        // <!--> is a valid comment
        assertWindows1252("<!--" + // start comment
                "<meta charset='UTF-8'>" + // inside comment
                "-->" + // end comment
                "<!-->" + // compact comment
                "<meta charset='WINDOWS-1252'>" // outside comment, charset declaration
        );
    }

    /**
     * Asserts that the detector reports windows-1252 for the given snippet.
     *
     * @param html HTML snippet to run through the detector
     * @throws IOException if detection fails while reading the snippet
     */
    private void assertWindows1252(String html) throws IOException {
        assertCharset(html, Charset.forName("WINDOWS-1252"));
    }

    /**
     * Asserts that the detector reports {@code charset} for the given snippet.
     *
     * @param html    HTML snippet to run through the detector
     * @param charset charset the first detection result is expected to carry
     * @throws IOException if detection fails while reading the snippet
     */
    private void assertCharset(String html, Charset charset) throws IOException {
        assertEquals(charset, detectCharset(html),
                html + " should be detected as " + charset);
    }

    /**
     * Runs {@link HtmlEncodingDetector} over the snippet and returns its first answer.
     *
     * <p>The snippet is encoded as UTF-8 bytes before being handed to the detector, and a
     * fresh {@link Metadata} and {@link ParseContext} are used for every call.
     *
     * @param test HTML snippet to detect
     * @return the charset of the first result, or {@code null} when the detector returns
     *         no results
     * @throws IOException if the detector fails while reading the stream
     */
    private Charset detectCharset(String test) throws IOException {
        Metadata metadata = new Metadata();
        try (TikaInputStream tis = TikaInputStream.get(test.getBytes(StandardCharsets.UTF_8))) {
            List<EncodingResult> results =
                    new HtmlEncodingDetector().detect(tis, metadata, new ParseContext());
            // Only the first result is inspected; an empty list maps to null so the
            // assertion fails with a readable "expected <charset> but was <null>".
            return results.isEmpty() ? null : results.get(0).getCharset();
        }
    }
}