PathSelector.java
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.maven.impl;
import java.io.File;
import java.nio.file.FileSystem;
import java.nio.file.Path;
import java.nio.file.PathMatcher;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Iterator;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Objects;
import java.util.Set;
import org.apache.maven.api.annotations.Nonnull;
/**
* Determines whether a path is selected according to include/exclude patterns.
* The pathnames used for method parameters will be relative to some base directory
* and use {@code '/'} as separator, regardless of the hosting operating system.
*
* <h2>Syntax</h2>
* If a pattern contains the {@code ':'} character and the prefix before is longer than 1 character,
* then that pattern is given verbatim to {@link FileSystem#getPathMatcher(String)}, which interprets
* the part before {@code ':'} as the syntax (usually {@code "glob"} or {@code "regex"}).
* If a pattern does not contain the {@code ':'} character, or if the prefix is one character long
* (interpreted as a Windows drive), then the syntax defaults to a reproduction of the Maven 3 behavior.
* This is implemented as the {@code "glob"} syntax with the following modifications:
*
* <ul>
* <li>The platform-specific separator ({@code '\\'} on Windows) is replaced by {@code '/'}.
* Note that it means that the backslash cannot be used for escaping characters.</li>
* <li>Trailing {@code "/"} is completed as {@code "/**"}.</li>
* <li>The {@code "**"} wildcard means "0 or more directories" instead of "1 or more directories".
* This is implemented by adding variants of the pattern without the {@code "**"} wildcard.</li>
* <li>Bracket characters [ ] and { } are escaped.</li>
* <li>On Unix only, the escape character {@code '\\'} is itself escaped.</li>
* </ul>
*
* If above changes are not desired, put an explicit {@code "glob:"} prefix before the pattern.
* Note that putting such a prefix is recommended anyway for better performances.
*
* @see java.nio.file.FileSystem#getPathMatcher(String)
*/
final class PathSelector implements PathMatcher {
/**
* Patterns which should be excluded by default, like <abbr>SCM</abbr> files.
*
* <p><b>Source:</b> this list is copied from {@code plexus-utils-4.0.2} (released in
* September 23, 2024), class {@code org.codehaus.plexus.util.AbstractScanner}.</p>
*/
private static final List<String> DEFAULT_EXCLUDES = List.of(
// Miscellaneous typical temporary files
"**/*~",
"**/#*#",
"**/.#*",
"**/%*%",
"**/._*",
// CVS
"**/CVS",
"**/CVS/**",
"**/.cvsignore",
// RCS
"**/RCS",
"**/RCS/**",
// SCCS
"**/SCCS",
"**/SCCS/**",
// Visual SourceSafe
"**/vssver.scc",
// MKS
"**/project.pj",
// Subversion
"**/.svn",
"**/.svn/**",
// Arch
"**/.arch-ids",
"**/.arch-ids/**",
// Bazaar
"**/.bzr",
"**/.bzr/**",
// SurroundSCM
"**/.MySCMServerInfo",
// Mac
"**/.DS_Store",
// Serena Dimensions Version 10
"**/.metadata",
"**/.metadata/**",
// Mercurial
"**/.hg",
"**/.hg/**",
// git
"**/.git",
"**/.git/**",
"**/.gitignore",
// BitKeeper
"**/BitKeeper",
"**/BitKeeper/**",
"**/ChangeSet",
"**/ChangeSet/**",
// darcs
"**/_darcs",
"**/_darcs/**",
"**/.darcsrepo",
"**/.darcsrepo/**",
"**/-darcs-backup*",
"**/.darcs-temp-mail");
/**
* Maximum number of characters of the prefix before {@code ':'} for handling as a Maven syntax.
*/
private static final int MAVEN_SYNTAX_THRESHOLD = 1;
/**
* The default syntax to use if none was specified. Note that when this default syntax is applied,
* the user-provided pattern get some changes as documented in class Javadoc.
*/
private static final String DEFAULT_SYNTAX = "glob:";
/**
* Characters having a special meaning in the glob syntax.
*
* @see FileSystem#getPathMatcher(String)
*/
private static final String SPECIAL_CHARACTERS = "*?[]{}\\";
/**
* A path matcher which accepts all files.
*
* @see #simplify()
*/
static final PathMatcher INCLUDES_ALL = (path) -> true;
/**
* String representations of the normalized include filters.
* Each pattern shall be prefixed by its syntax, which is {@value #DEFAULT_SYNTAX} by default.
* An empty array means to include all files.
*
* @see #toString()
*/
private final String[] includePatterns;
/**
* String representations of the normalized exclude filters.
* Each pattern shall be prefixed by its syntax. If no syntax is specified,
* the default is a Maven 3 syntax similar, but not identical, to {@value #DEFAULT_SYNTAX}.
* This array may be longer or shorter than the user-supplied excludes, depending on whether
* default excludes have been added and whether some unnecessary excludes have been omitted.
*
* @see #toString()
*/
private final String[] excludePatterns;
/**
* The matcher for includes. The length of this array is equal to {@link #includePatterns} array length.
* An empty array means to include all files.
*/
private final PathMatcher[] includes;
/**
* The matcher for excludes. The length of this array is equal to {@link #excludePatterns} array length.
*/
private final PathMatcher[] excludes;
/**
* The matcher for all directories to include. This array includes the parents of all those directories,
* because they need to be accepted before we can walk to the sub-directories.
* This is an optimization for skipping whole directories when possible.
* An empty array means to include all directories.
*/
private final PathMatcher[] dirIncludes;
/**
* The matcher for directories to exclude. This array does <em>not</em> include the parent directories,
* because they may contain other sub-trees that need to be included.
* This is an optimization for skipping whole directories when possible.
*/
private final PathMatcher[] dirExcludes;
/**
* The base directory. All files will be relativized to that directory before to be matched.
*/
private final Path baseDirectory;
/**
* Whether paths must be relativized before being given to a matcher. If {@code true}, then every paths
* will be made relative to {@link #baseDirectory} for allowing patterns like {@code "foo/bar/*.java"}
* to work. As a slight optimization, we can skip this step if all patterns start with {@code "**"}.
*/
private final boolean needRelativize;
/**
* Creates a new selector from the given includes and excludes.
*
* @param directory the base directory of the files to filter
* @param includes the patterns of the files to include, or null or empty for including all files
* @param excludes the patterns of the files to exclude, or null or empty for no exclusion
* @param useDefaultExcludes whether to augment the excludes with a default set of <abbr>SCM</abbr> patterns
* @throws NullPointerException if directory is null
*/
private PathSelector(
@Nonnull Path directory,
Collection<String> includes,
Collection<String> excludes,
boolean useDefaultExcludes) {
baseDirectory = Objects.requireNonNull(directory, "directory cannot be null");
includePatterns = normalizePatterns(includes, false);
excludePatterns = normalizePatterns(effectiveExcludes(excludes, includePatterns, useDefaultExcludes), true);
FileSystem fileSystem = baseDirectory.getFileSystem();
this.includes = matchers(fileSystem, includePatterns);
this.excludes = matchers(fileSystem, excludePatterns);
dirIncludes = matchers(fileSystem, directoryPatterns(includePatterns, false));
dirExcludes = matchers(fileSystem, directoryPatterns(excludePatterns, true));
needRelativize = needRelativize(includePatterns) || needRelativize(excludePatterns);
}
/**
* Creates a new matcher from the given includes and excludes.
*
* @param directory the base directory of the files to filter
* @param includes the patterns of the files to include, or null or empty for including all files
* @param excludes the patterns of the files to exclude, or null or empty for no exclusion
* @param useDefaultExcludes whether to augment the excludes with a default set of <abbr>SCM</abbr> patterns
* @throws NullPointerException if directory is null
* @return a path matcher for the given includes and excludes
*/
public static PathMatcher of(
@Nonnull Path directory,
Collection<String> includes,
Collection<String> excludes,
boolean useDefaultExcludes) {
return new PathSelector(directory, includes, excludes, useDefaultExcludes).simplify();
}
/**
* Returns the given array of excludes, optionally expanded with a default set of excludes,
* then with unnecessary excludes omitted. An unnecessary exclude is an exclude which will never
* match a file because there are no includes which would accept a file that could match the exclude.
* For example, if the only include is {@code "*.java"}, then the <code>"**/project.pj"</code>,
* <code>"**/.DS_Store"</code> and other excludes will never match a file and can be omitted.
* Because the list of {@linkplain #DEFAULT_EXCLUDES default excludes} contains many elements,
* removing unnecessary excludes can reduce a lot the number of matches tested on each source file.
*
* <h4>Implementation note</h4>
* The removal of unnecessary excludes is done on a best effort basis. The current implementation
* compares only the prefixes and suffixes of each pattern, keeping the pattern in case of doubt.
* This is not bad, but it does not remove all unnecessary patterns. It would be possible to do
* better in the future if benchmarking suggests that it would be worth the effort.
*
* @param excludes the user-specified excludes, potentially not yet converted to glob syntax
* @param includes the include patterns converted to glob syntax
* @param useDefaultExcludes whether to expand user exclude with the set of default excludes
* @return the potentially expanded or reduced set of excludes to use
*/
private static Collection<String> effectiveExcludes(
Collection<String> excludes, final String[] includes, final boolean useDefaultExcludes) {
if (excludes == null || excludes.isEmpty()) {
if (useDefaultExcludes) {
excludes = new ArrayList<>(DEFAULT_EXCLUDES);
} else {
return List.of();
}
} else {
excludes = new ArrayList<>(excludes);
excludes.removeIf(Objects::isNull);
if (useDefaultExcludes) {
excludes.addAll(DEFAULT_EXCLUDES);
}
}
if (includes.length == 0) {
return excludes;
}
/*
* Get the prefixes and suffixes of all includes, stopping at the first special character.
* Redundant prefixes and suffixes are omitted.
*/
var prefixes = new String[includes.length];
var suffixes = new String[includes.length];
for (int i = 0; i < includes.length; i++) {
String include = includes[i];
if (!include.startsWith(DEFAULT_SYNTAX)) {
return excludes; // Do not filter if at least one pattern is too complicated.
}
include = include.substring(DEFAULT_SYNTAX.length());
prefixes[i] = prefixOrSuffix(include, false);
suffixes[i] = prefixOrSuffix(include, true);
}
prefixes = sortByLength(prefixes, false);
suffixes = sortByLength(suffixes, true);
/*
* Keep only the exclude which start with one of the prefixes and end with one of the suffixes.
* Note that a prefix or suffix may be the empty string, which match everything.
*/
final Iterator<String> it = excludes.iterator();
nextExclude:
while (it.hasNext()) {
final String exclude = it.next();
final int s = exclude.indexOf(':');
if (s <= MAVEN_SYNTAX_THRESHOLD || exclude.startsWith(DEFAULT_SYNTAX)) {
if (cannotMatch(exclude, prefixes, false) || cannotMatch(exclude, suffixes, true)) {
it.remove();
}
}
}
return excludes;
}
/**
* Returns the maximal amount of ordinary characters at the beginning or end of the given pattern.
* The prefix or suffix stops at the first {@linkplain #SPECIAL_CHARACTERS special character}.
*
* @param include the pattern for which to get a prefix or suffix without special character
* @param suffix {@code false} if a prefix is desired, or {@code true} if a suffix is desired
*/
private static String prefixOrSuffix(final String include, boolean suffix) {
int s = suffix ? -1 : include.length();
for (int i = SPECIAL_CHARACTERS.length(); --i >= 0; ) {
char c = SPECIAL_CHARACTERS.charAt(i);
if (suffix) {
s = Math.max(s, include.lastIndexOf(c));
} else {
int p = include.indexOf(c);
if (p >= 0 && p < s) {
s = p;
}
}
}
return suffix ? include.substring(s + 1) : include.substring(0, s);
}
/**
* Returns {@code true} if the given exclude cannot match any include patterns.
* In case of doubt, returns {@code false}.
*
* @param exclude the exclude pattern to test
* @param fragments the prefixes or suffixes (fragments without special characters) of the includes
* @param suffix {@code false} if the specified fragments are prefixes, {@code true} if they are suffixes
* @return {@code true} if it is certain that the exclude pattern cannot match, or {@code false} in case of doubt
*/
private static boolean cannotMatch(String exclude, final String[] fragments, final boolean suffix) {
exclude = prefixOrSuffix(exclude, suffix);
for (String fragment : fragments) {
int fg = fragment.length();
int ex = exclude.length();
int length = Math.min(fg, ex);
if (suffix) {
fg -= length;
ex -= length;
} else {
fg = 0;
ex = 0;
}
if (exclude.regionMatches(ex, fragment, fg, length)) {
return false;
}
}
return true;
}
/**
* Sorts the given patterns by their length. The main intent is to have the empty string first,
* while will cause the loops testing for prefixes and suffixes to stop almost immediately.
* Short prefixes or suffixes are also more likely to be matched.
*
* @param fragments the fragments to sort in-place
* @param suffix {@code false} if the specified fragments are prefixes, {@code true} if they are suffixes
* @return the given array, or a smaller array if some fragments were discarded because redundant
*/
private static String[] sortByLength(final String[] fragments, final boolean suffix) {
Arrays.sort(fragments, (s1, s2) -> s1.length() - s2.length());
int count = 0;
/*
* Simplify the array of prefixes or suffixes by removing all redundant elements.
* An element is redundant if there is a shorter prefix or suffix with the same characters.
*/
nextBase:
for (String fragment : fragments) {
for (int i = count; --i >= 0; ) {
String base = fragments[i];
if (suffix ? fragment.endsWith(base) : fragment.startsWith(base)) {
continue nextBase; // Skip this fragment
}
}
fragments[count++] = fragment;
}
return (fragments.length == count) ? fragments : Arrays.copyOf(fragments, count);
}
/**
* Returns the given array of patterns with path separator normalized to {@code '/'}.
* Null or empty patterns are ignored, and duplications are removed.
*
* @param patterns the patterns to normalize
* @param excludes whether the patterns are exclude patterns
* @return normalized patterns without null, empty or duplicated patterns
*/
private static String[] normalizePatterns(final Collection<String> patterns, final boolean excludes) {
if (patterns == null || patterns.isEmpty()) {
return new String[0];
}
// TODO: use `LinkedHashSet.newLinkedHashSet(int)` instead with JDK19.
final var normalized = new LinkedHashSet<String>(patterns.size());
for (String pattern : patterns) {
if (pattern != null && !pattern.isEmpty()) {
if (pattern.indexOf(':') <= MAVEN_SYNTAX_THRESHOLD) {
pattern = pattern.replace(File.separatorChar, '/');
if (pattern.endsWith("/")) {
pattern += "**";
}
// Following are okay only when "**" means "0 or more directories".
while (pattern.endsWith("/**/**")) {
pattern = pattern.substring(0, pattern.length() - 3);
}
while (pattern.startsWith("**/**/")) {
pattern = pattern.substring(3);
}
pattern = pattern.replace("/**/**/", "/**/");
// Escape special characters, including braces
// Braces from user input must be literals; we'll inject our own braces for expansion below
pattern = pattern.replace("\\", "\\\\")
.replace("[", "\\[")
.replace("]", "\\]")
.replace("{", "\\{")
.replace("}", "\\}");
// Transform ** patterns to use brace expansion for POSIX behavior
// This replaces the complex addPatternsWithOneDirRemoved logic
// We perform this after escaping so that only these injected braces participate in expansion
pattern = pattern.replace("**/", "{**/,}");
normalized.add(DEFAULT_SYNTAX + pattern);
} else {
normalized.add(pattern);
}
}
}
return simplify(normalized, excludes);
}
/**
* Applies some heuristic rules for simplifying the set of patterns,
* then returns the patterns as an array.
*
* @param patterns the patterns to simplify and return as an array
* @param excludes whether the patterns are exclude patterns
* @return the set content as an array, after simplification
*/
private static String[] simplify(Set<String> patterns, boolean excludes) {
/*
* If the "**" pattern is present, it makes all other patterns useless.
* In the case of include patterns, an empty set means to include everything.
*/
if (patterns.remove("**")) {
patterns.clear();
if (excludes) {
patterns.add("**");
}
}
return patterns.toArray(String[]::new);
}
/**
* Eventually adds the parent directory of the given patterns, without duplicated values.
* The patterns given to this method should have been normalized.
*
* @param patterns the normalized include or exclude patterns
* @param excludes whether the patterns are exclude patterns
* @return patterns of directories to include or exclude
*/
private static String[] directoryPatterns(final String[] patterns, final boolean excludes) {
// TODO: use `LinkedHashSet.newLinkedHashSet(int)` instead with JDK19.
final var directories = new LinkedHashSet<String>(patterns.length);
for (String pattern : patterns) {
if (pattern.startsWith(DEFAULT_SYNTAX)) {
if (excludes) {
if (pattern.endsWith("/**")) {
directories.add(pattern.substring(0, pattern.length() - 3));
}
} else {
int s = pattern.indexOf(':');
if (pattern.regionMatches(++s, "**/", 0, 3)) {
s = pattern.indexOf('/', s + 3);
if (s < 0) {
return new String[0]; // Pattern is "**", so we need to accept everything.
}
directories.add(pattern.substring(0, s));
}
}
}
}
return simplify(directories, excludes);
}
/**
* Returns {@code true} if at least one pattern requires path being relativized before to be matched.
*
* @param patterns include or exclude patterns
* @return whether at least one pattern require relativization
*/
private static boolean needRelativize(String[] patterns) {
for (String pattern : patterns) {
if (!pattern.startsWith(DEFAULT_SYNTAX + "**/")) {
return true;
}
}
return false;
}
/**
* Creates the path matchers for the given patterns.
* The syntax (usually {@value #DEFAULT_SYNTAX}) must be specified for each pattern.
*/
private static PathMatcher[] matchers(final FileSystem fs, final String[] patterns) {
final var matchers = new PathMatcher[patterns.length];
for (int i = 0; i < patterns.length; i++) {
matchers[i] = fs.getPathMatcher(patterns[i]);
}
return matchers;
}
/**
* {@return a potentially simpler matcher equivalent to this matcher}.
*/
@SuppressWarnings("checkstyle:MissingSwitchDefault")
private PathMatcher simplify() {
if (!needRelativize && excludes.length == 0) {
switch (includes.length) {
case 0:
return INCLUDES_ALL;
case 1:
return includes[0];
}
}
return this;
}
/**
* Determines whether a path is selected.
* This is true if the given file matches an include pattern and no exclude pattern.
*
* @param path the pathname to test, must not be {@code null}
* @return {@code true} if the given path is selected, {@code false} otherwise
*/
@Override
public boolean matches(Path path) {
if (needRelativize) {
path = baseDirectory.relativize(path);
}
return (includes.length == 0 || isMatched(path, includes))
&& (excludes.length == 0 || !isMatched(path, excludes));
}
/**
* {@return whether the given file matches according to one of the given matchers}.
*/
private static boolean isMatched(Path path, PathMatcher[] matchers) {
for (PathMatcher matcher : matchers) {
if (matcher.matches(path)) {
return true;
}
}
return false;
}
/**
* Returns whether {@link #couldHoldSelected(Path)} may return {@code false} for some directories.
* This method can be used to determine if directory filtering optimization is possible.
*
* @return {@code true} if directory filtering is possible, {@code false} if all directories
* will be considered as potentially containing selected files
*/
boolean canFilterDirectories() {
return dirIncludes.length != 0 || dirExcludes.length != 0;
}
/**
* Determines whether a directory could contain selected paths.
*
* @param directory the directory pathname to test, must not be {@code null}
* @return {@code true} if the given directory might contain selected paths, {@code false} if the
* directory will definitively not contain selected paths
*/
public boolean couldHoldSelected(Path directory) {
if (baseDirectory.equals(directory)) {
return true;
}
directory = baseDirectory.relativize(directory);
return (dirIncludes.length == 0 || isMatched(directory, dirIncludes))
&& (dirExcludes.length == 0 || !isMatched(directory, dirExcludes));
}
/**
* Appends the elements of the given array in the given buffer.
* This is a helper method for {@link #toString()} implementations.
*
* @param buffer the buffer to add the elements to
* @param label label identifying the array of elements to add
* @param patterns the elements to append, or {@code null} if none
*/
private static void append(StringBuilder buffer, String label, String[] patterns) {
buffer.append(label).append(": [");
if (patterns != null) {
for (int i = 0; i < patterns.length; i++) {
if (i != 0) {
buffer.append(", ");
}
buffer.append(patterns[i]);
}
}
buffer.append(']');
}
/**
* {@return a string representation for logging purposes}.
*/
@Override
public String toString() {
var buffer = new StringBuilder();
append(buffer, "includes", includePatterns);
append(buffer.append(", "), "excludes", excludePatterns);
return buffer.toString();
}
}