// PathNormalizationTest.java

/*
 * Copyright 2013 Google Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.google.common.jimfs;

import static com.google.common.jimfs.PathNormalization.CASE_FOLD_ASCII;
import static com.google.common.jimfs.PathNormalization.CASE_FOLD_UNICODE;
import static com.google.common.jimfs.PathNormalization.NFC;
import static com.google.common.jimfs.PathNormalization.NFD;
import static com.google.common.jimfs.TestUtils.assertNotEquals;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;

import com.google.common.collect.ImmutableSet;
import java.util.regex.Pattern;
import org.junit.Test;
import org.junit.runner.RunWith;
import org.junit.runners.JUnit4;

/**
 * Tests for {@link PathNormalization}.
 *
 * <p>Each test first assigns the set of normalizations under test to the {@code normalizations}
 * field and then calls the assertion helpers at the bottom of the class, which apply that set
 * through {@code PathNormalization.normalize} or {@code PathNormalization.compilePattern}.
 *
 * <p>All non-ASCII test data is written with Unicode escapes so that it cannot be corrupted when
 * the source file is read or written with the wrong character encoding.
 *
 * @author Colin Decker
 */
@RunWith(JUnit4.class)
public class PathNormalizationTest {

  /** Normalizations used by the assertion helpers; every test assigns this before asserting. */
  private ImmutableSet<PathNormalization> normalizations;

  @Test
  public void testNone() {
    normalizations = ImmutableSet.of();

    assertNormalizedEqual("foo", "foo");
    assertNormalizedUnequal("Foo", "foo");
    assertNormalizedUnequal("\u00c5", "\u212b");
    assertNormalizedUnequal("Am\u00e9lie", "Ame\u0301lie");
  }

  /**
   * Rows of strings that differ only by case; {@link #testCaseFold()} expects every pair of
   * strings within a row to normalize to the same value under {@code CASE_FOLD_UNICODE}.
   */
  private static final String[][] CASE_FOLD_TEST_DATA = {
    {"foo", "fOo", "foO", "Foo", "FOO"},
    // U+FB03 LATIN SMALL LIGATURE FFI
    {"e\ufb03cient", "efficient", "e\ufb03cient", "E\ufb03cient", "EFFICIENT"},
    // U+FB02 LATIN SMALL LIGATURE FL
    {"\ufb02our", "flour", "\ufb02our", "Flour", "FLOUR"},
    // U+017F LATIN SMALL LETTER LONG S
    {"po\u017ft", "post", "po\u017ft", "Po\u017ft", "POST"},
    // U+FB05 LATIN SMALL LIGATURE LONG S T
    {"po\ufb05", "post", "po\ufb05", "Po\ufb05", "POST"},
    {"\ufb05op", "stop", "\ufb05op", "Stop", "STOP"},
    // U+00FC / U+00DC u with diaeresis, U+00DF sharp s
    {"tsch\u00fc\u00df", "tsch\u00fcss", "tsch\u00fc\u00df", "Tsch\u00fc\u00df", "TSCH\u00dcSS"},
    {"wei\u00df", "weiss", "wei\u00df", "Wei\u00df", "WEISS"},
    // U+1E9E LATIN CAPITAL LETTER SHARP S
    {"WEI\u1e9e", "weiss", "wei\u00df", "Wei\u00df", "WEI\u1e9e"},
    // Greek word ending in final sigma (U+03C2) vs. regular sigma (U+03C3 / U+03A3)
    {
      "\u03c3\u03c4\u03b9\u03b3\u03bc\u03b1\u03c2",
      "\u03c3\u03c4\u03b9\u03b3\u03bc\u03b1\u03c3",
      "\u03c3\u03c4\u03b9\u03b3\u03bc\u03b1\u03c2",
      "\u03a3\u03c4\u03b9\u03b3\u03bc\u03b1\u03c2",
      "\u03a3\u03a4\u0399\u0393\u039c\u0391\u03a3"
    },
    // Greek phrase starting with alpha + varia + ypogegrammeni (U+1FB2), which has a
    // multi-character case folding (U+1F70 U+03B9)
    {
      "\u1fb2 \u03c3\u03c4\u03bf \u03b4\u03b9\u03ac\u03bf\u03bb\u03bf",
      "\u1f70\u03b9 \u03c3\u03c4\u03bf \u03b4\u03b9\u03ac\u03bf\u03bb\u03bf",
      "\u1fb2 \u03c3\u03c4\u03bf \u03b4\u03b9\u03ac\u03bf\u03bb\u03bf",
      "\u1fba\u0345 \u03a3\u03c4\u03bf \u0394\u03b9\u03ac\u03bf\u03bb\u03bf",
      "\u1fba\u0399 \u03a3\u03a4\u039f \u0394\u0399\u0386\u039f\u039b\u039f"
    },
    // U+2167 ROMAN NUMERAL EIGHT / U+2177 SMALL ROMAN NUMERAL EIGHT
    {"Henry \u2167", "henry \u2177", "henry \u2177", "Henry \u2167", "HENRY \u2167"},
    // U+24C0 CIRCLED LATIN CAPITAL LETTER K / U+24DA CIRCLED LATIN SMALL LETTER K
    {
      "I Work At \u24c0",
      "i work at \u24da",
      "i work at \u24da",
      "I Work At \u24c0",
      "I WORK AT \u24c0"
    },
    // small-capital letters: U+0280 (whose uppercase is U+01A6), U+1D00, U+1D07
    {
      "\u0280\u1d00\u0280\u1d07",
      "\u0280\u1d00\u0280\u1d07",
      "\u0280\u1d00\u0280\u1d07",
      "\u01a6\u1d00\u0280\u1d07",
      "\u01a6\u1d00\u01a6\u1d07"
    },
    // the leading letter of the Greek phrase above, on its own
    {"\u1fba\u0345", "\u1f70\u03b9", "\u1fb2", "\u1fba\u0345", "\u1fba\u0399"}
  };

  @Test
  public void testCaseFold() {
    normalizations = ImmutableSet.of(CASE_FOLD_UNICODE);

    assertAllRowsNormalizedEqual(CASE_FOLD_TEST_DATA);
  }

  @Test
  public void testCaseInsensitiveAscii() {
    normalizations = ImmutableSet.of(CASE_FOLD_ASCII);

    assertRowNormalizedEqual("foo", "FOO", "fOo", "Foo");

    // sharp s (U+00DF) is not an ASCII letter, so it is expected to stay distinct from "ss"
    assertNormalizedUnequal("wei\u00df", "weiss");
  }

  /** Rows of canonically equivalent strings that are expected to be equal under NFC and NFD. */
  private static final String[][] NORMALIZE_TEST_DATA = {
    // two forms of A with ring above: U+00C5 and the Angstrom sign U+212B (one code point each)
    {"\u00c5", "\u212b"},
    // two forms of "Amelie" with e-acute (one composed, one decomposed)
    {"Am\u00e9lie", "Ame\u0301lie"}
  };

  @Test
  public void testNormalizeNfc() {
    normalizations = ImmutableSet.of(NFC);

    assertAllRowsNormalizedEqual(NORMALIZE_TEST_DATA);
  }

  @Test
  public void testNormalizeNfd() {
    normalizations = ImmutableSet.of(NFD);

    assertAllRowsNormalizedEqual(NORMALIZE_TEST_DATA);
  }

  /** Rows expected to be equal when Unicode normalization and Unicode case folding are combined. */
  private static final String[][] NORMALIZE_CASE_FOLD_TEST_DATA = {
    {"\u00c5", "\u00e5", "\u212b"},
    {"Am\u00e9lie", "Am\u00c9lie", "Ame\u0301lie", "AME\u0301LIE"}
  };

  @Test
  public void testNormalizeNfcCaseFold() {
    normalizations = ImmutableSet.of(NFC, CASE_FOLD_UNICODE);

    assertAllRowsNormalizedEqual(NORMALIZE_CASE_FOLD_TEST_DATA);
  }

  @Test
  public void testNormalizeNfdCaseFold() {
    normalizations = ImmutableSet.of(NFD, CASE_FOLD_UNICODE);

    assertAllRowsNormalizedEqual(NORMALIZE_CASE_FOLD_TEST_DATA);
  }

  /**
   * Rows whose strings differ in both Unicode form and case; the tests below expect them to be
   * unequal under NFC + ASCII case folding but equal under NFD + ASCII case folding.
   */
  private static final String[][] NORMALIZED_CASE_INSENSITIVE_ASCII_TEST_DATA = {
    {"\u00e5", "\u212b"},
    {"Am\u00e9lie", "AME\u0301LIE"}
  };

  @Test
  public void testNormalizeNfcCaseFoldAscii() {
    normalizations = ImmutableSet.of(NFC, CASE_FOLD_ASCII);

    assertAllRowsNormalizedUnequal(NORMALIZED_CASE_INSENSITIVE_ASCII_TEST_DATA);
  }

  @Test
  public void testNormalizeNfdCaseFoldAscii() {
    normalizations = ImmutableSet.of(NFD, CASE_FOLD_ASCII);

    // since decomposition happens before case folding, the strings are equal when the
    // decomposed ASCII letter is folded
    assertAllRowsNormalizedEqual(NORMALIZED_CASE_INSENSITIVE_ASCII_TEST_DATA);
  }

  // regex patterns offer loosely similar matching, but that's all

  @Test
  public void testNone_pattern() {
    normalizations = ImmutableSet.of();
    assertNormalizedPatternMatches("foo", "foo");
    assertNormalizedPatternDoesNotMatch("foo", "FOO");
    assertNormalizedPatternDoesNotMatch("FOO", "foo");
  }

  @Test
  public void testCaseFold_pattern() {
    normalizations = ImmutableSet.of(CASE_FOLD_UNICODE);
    assertNormalizedPatternMatches("foo", "foo");
    assertNormalizedPatternMatches("foo", "FOO");
    assertNormalizedPatternMatches("FOO", "foo");
    assertNormalizedPatternMatches("Am\u00e9lie", "AM\u00c9LIE");
    assertNormalizedPatternMatches("Ame\u0301lie", "AME\u0301LIE");
    assertNormalizedPatternDoesNotMatch("Am\u00e9lie", "Ame\u0301lie");
    assertNormalizedPatternDoesNotMatch("AM\u00c9LIE", "AME\u0301LIE");
    assertNormalizedPatternDoesNotMatch("Am\u00e9lie", "AME\u0301LIE");
  }

  @Test
  public void testCaseFoldAscii_pattern() {
    normalizations = ImmutableSet.of(CASE_FOLD_ASCII);
    assertNormalizedPatternMatches("foo", "foo");
    assertNormalizedPatternMatches("foo", "FOO");
    assertNormalizedPatternMatches("FOO", "foo");
    assertNormalizedPatternMatches("Ame\u0301lie", "AME\u0301LIE");
    assertNormalizedPatternDoesNotMatch("Am\u00e9lie", "AM\u00c9LIE");
    assertNormalizedPatternDoesNotMatch("Am\u00e9lie", "Ame\u0301lie");
    assertNormalizedPatternDoesNotMatch("AM\u00c9LIE", "AME\u0301LIE");
    assertNormalizedPatternDoesNotMatch("Am\u00e9lie", "AME\u0301LIE");
  }

  @Test
  public void testNormalizeNfc_pattern() {
    normalizations = ImmutableSet.of(NFC);
    assertNormalizedPatternMatches("foo", "foo");
    assertNormalizedPatternDoesNotMatch("foo", "FOO");
    assertNormalizedPatternDoesNotMatch("FOO", "foo");
    assertNormalizedPatternMatches("Am\u00e9lie", "Ame\u0301lie");
    assertNormalizedPatternDoesNotMatch("Am\u00e9lie", "AME\u0301LIE");
  }

  @Test
  public void testNormalizeNfd_pattern() {
    normalizations = ImmutableSet.of(NFD);
    assertNormalizedPatternMatches("foo", "foo");
    assertNormalizedPatternDoesNotMatch("foo", "FOO");
    assertNormalizedPatternDoesNotMatch("FOO", "foo");
    assertNormalizedPatternMatches("Am\u00e9lie", "Ame\u0301lie");
    assertNormalizedPatternDoesNotMatch("Am\u00e9lie", "AME\u0301LIE");
  }

  @Test
  public void testNormalizeNfcCaseFold_pattern() {
    normalizations = ImmutableSet.of(NFC, CASE_FOLD_UNICODE);
    assertNormalizedPatternMatches("foo", "foo");
    assertNormalizedPatternMatches("foo", "FOO");
    assertNormalizedPatternMatches("FOO", "foo");
    assertNormalizedPatternMatches("Am\u00e9lie", "AM\u00c9LIE");
    assertNormalizedPatternMatches("Ame\u0301lie", "AME\u0301LIE");
    assertNormalizedPatternMatches("Am\u00e9lie", "Ame\u0301lie");
    assertNormalizedPatternMatches("AM\u00c9LIE", "AME\u0301LIE");
    assertNormalizedPatternMatches("Am\u00e9lie", "AME\u0301LIE");
  }

  @Test
  public void testNormalizeNfdCaseFold_pattern() {
    normalizations = ImmutableSet.of(NFD, CASE_FOLD_UNICODE);
    assertNormalizedPatternMatches("foo", "foo");
    assertNormalizedPatternMatches("foo", "FOO");
    assertNormalizedPatternMatches("FOO", "foo");
    assertNormalizedPatternMatches("Am\u00e9lie", "AM\u00c9LIE");
    assertNormalizedPatternMatches("Ame\u0301lie", "AME\u0301LIE");
    assertNormalizedPatternMatches("Am\u00e9lie", "Ame\u0301lie");
    assertNormalizedPatternMatches("AM\u00c9LIE", "AME\u0301LIE");
    assertNormalizedPatternMatches("Am\u00e9lie", "AME\u0301LIE");
  }

  @Test
  public void testNormalizeNfcCaseFoldAscii_pattern() {
    normalizations = ImmutableSet.of(NFC, CASE_FOLD_ASCII);
    assertNormalizedPatternMatches("foo", "foo");
    assertNormalizedPatternMatches("foo", "FOO");
    assertNormalizedPatternMatches("FOO", "foo");

    // these are all a bit fuzzy as when CASE_INSENSITIVE is present but not UNICODE_CASE, ASCII
    // only strings are expected
    assertNormalizedPatternMatches("Ame\u0301lie", "AME\u0301LIE");
    assertNormalizedPatternDoesNotMatch("Am\u00e9lie", "AM\u00c9LIE");
    assertNormalizedPatternMatches("Am\u00e9lie", "Ame\u0301lie");
    assertNormalizedPatternMatches("AM\u00c9LIE", "AME\u0301LIE");
  }

  @Test
  public void testNormalizeNfdCaseFoldAscii_pattern() {
    normalizations = ImmutableSet.of(NFD, CASE_FOLD_ASCII);
    assertNormalizedPatternMatches("foo", "foo");
    assertNormalizedPatternMatches("foo", "FOO");
    assertNormalizedPatternMatches("FOO", "foo");

    // these are all a bit fuzzy as when CASE_INSENSITIVE is present but not UNICODE_CASE, ASCII
    // only strings are expected
    assertNormalizedPatternMatches("Ame\u0301lie", "AME\u0301LIE");
    assertNormalizedPatternDoesNotMatch("Am\u00e9lie", "AM\u00c9LIE");
    assertNormalizedPatternMatches("Am\u00e9lie", "Ame\u0301lie");
    assertNormalizedPatternMatches("AM\u00c9LIE", "AME\u0301LIE");
  }

  /**
   * Asserts, for every row of the given table, that all strings in the row normalize to the same
   * string using the current normalizations.
   */
  private void assertAllRowsNormalizedEqual(String[][] table) {
    for (String[] row : table) {
      assertRowNormalizedEqual(row);
    }
  }

  /**
   * Asserts that every pair of the given strings (including each string paired with itself)
   * normalizes to the same string using the current normalizations.
   */
  private void assertRowNormalizedEqual(String... row) {
    for (int i = 0; i < row.length; i++) {
      for (int j = i; j < row.length; j++) {
        assertNormalizedEqual(row[i], row[j]);
      }
    }
  }

  /**
   * Asserts, for every row of the given table, that every pair of distinct positions in the row
   * normalizes to different strings using the current normalizations.
   */
  private void assertAllRowsNormalizedUnequal(String[][] table) {
    for (String[] row : table) {
      for (int i = 0; i < row.length; i++) {
        // start at i + 1: a string always normalizes equal to itself
        for (int j = i + 1; j < row.length; j++) {
          assertNormalizedUnequal(row[i], row[j]);
        }
      }
    }
  }

  /** Asserts that the given strings normalize to the same string using the current normalizer. */
  private void assertNormalizedEqual(String first, String second) {
    // name the raw inputs so a failure inside a data-driven loop identifies the offending pair
    assertEquals(
        "'" + first + "' and '" + second + "' should normalize to the same string",
        PathNormalization.normalize(first, normalizations),
        PathNormalization.normalize(second, normalizations));
  }

  /** Asserts that the given strings normalize to different strings using the current normalizer. */
  private void assertNormalizedUnequal(String first, String second) {
    assertNotEquals(
        PathNormalization.normalize(first, normalizations),
        PathNormalization.normalize(second, normalizations));
  }

  /**
   * Asserts that the given strings match when one is compiled as a regex pattern using the current
   * normalizer and matched against the other. Both directions are checked.
   */
  private void assertNormalizedPatternMatches(String first, String second) {
    assertPatternMatchResult(first, second, true);
    assertPatternMatchResult(second, first, true);
  }

  /**
   * Asserts that the given strings do not match when one is compiled as a regex pattern using the
   * current normalizer and matched against the other. Both directions are checked.
   */
  private void assertNormalizedPatternDoesNotMatch(String first, String second) {
    assertPatternMatchResult(first, second, false);
    assertPatternMatchResult(second, first, false);
  }

  /**
   * Compiles {@code regex} with the current normalizations and asserts that matching it against
   * the whole of {@code input} succeeds or fails as given by {@code shouldMatch}.
   */
  private void assertPatternMatchResult(String regex, String input, boolean shouldMatch) {
    Pattern pattern = PathNormalization.compilePattern(regex, normalizations);
    boolean matched = pattern.matcher(input).matches();
    if (shouldMatch) {
      assertTrue("pattern '" + pattern + "' does not match '" + input + "'", matched);
    } else {
      assertFalse("pattern '" + pattern + "' should not match '" + input + "'", matched);
    }
  }
}