GlobRegexUtil.java

// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.  See the License for the
// specific language governing permissions and limitations
// under the License.

package org.apache.doris.common;

import com.google.re2j.Pattern;
import com.google.re2j.PatternSyntaxException;
import dk.brics.automaton.Automaton;
import dk.brics.automaton.BasicAutomata;
import dk.brics.automaton.BasicOperations;

import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;

/**
 * Utility to convert a restricted glob pattern into a regex.
 *
 * Supported glob syntax:
 * - '*' matches any sequence of characters
 * - '?' matches any single character
 * - '[...]' matches any character in the brackets
 * - '[!...]' matches any character not in the brackets
 * - '\\' escapes the next character
 */
public final class GlobRegexUtil {
    // Small LRU to cap compiled pattern memory.
    private static final int REGEX_CACHE_CAPACITY = 256;
    private static final Map<String, Pattern> REGEX_CACHE = new LinkedHashMap<String, Pattern>(
            REGEX_CACHE_CAPACITY, 0.75f, true) {
        @Override
        protected boolean removeEldestEntry(Map.Entry<String, Pattern> eldest) {
            return size() > REGEX_CACHE_CAPACITY;
        }
    };

    private GlobRegexUtil() {
    }

    public static Pattern getOrCompilePattern(String globPattern) {
        synchronized (REGEX_CACHE) {
            Pattern cached = REGEX_CACHE.get(globPattern);
            if (cached != null) {
                return cached;
            }
            String regex = globToRegex(globPattern);
            Pattern compiled;
            try {
                compiled = Pattern.compile(regex);
            } catch (PatternSyntaxException e) {
                throw new IllegalArgumentException("Invalid glob pattern: " + globPattern, e);
            }
            REGEX_CACHE.put(globPattern, compiled);
            return compiled;
        }
    }

    public static boolean matches(String globPattern, String candidate) {
        return getOrCompilePattern(globPattern).matcher(candidate).matches();
    }

    public static boolean isGlobSubsetOf(String subsetGlobPattern, String supersetGlobPattern) {
        return globToAutomaton(subsetGlobPattern).subsetOf(globToAutomaton(supersetGlobPattern));
    }

    private static Automaton globToAutomaton(String pattern) {
        List<Automaton> automata = new ArrayList<>();
        boolean isEscaped = false;
        int patternLength = pattern.length();
        for (int index = 0; index < patternLength; index++) {
            char currentChar = pattern.charAt(index);
            if (isEscaped) {
                automata.add(BasicAutomata.makeChar(currentChar));
                isEscaped = false;
                continue;
            }
            if (currentChar == '\\') {
                isEscaped = true;
                continue;
            }
            if (currentChar == '*') {
                automata.add(BasicAutomata.makeAnyString());
                continue;
            }
            if (currentChar == '?') {
                automata.add(BasicAutomata.makeAnyChar());
                continue;
            }
            if (currentChar == '[') {
                CharacterClass characterClass = parseCharacterClass(pattern, index);
                automata.add(characterClass.toAutomaton(pattern));
                index = characterClass.closeIndex;
                continue;
            }
            automata.add(BasicAutomata.makeChar(currentChar));
        }
        if (isEscaped) {
            automata.add(BasicAutomata.makeChar('\\'));
        }
        if (automata.isEmpty()) {
            return BasicAutomata.makeEmptyString();
        }
        return BasicOperations.concatenate(automata);
    }

    public static String globToRegex(String pattern) {
        StringBuilder regexBuilder = new StringBuilder();
        regexBuilder.append("^");
        boolean isEscaped = false;
        int patternLength = pattern.length();
        for (int index = 0; index < patternLength; index++) {
            char currentChar = pattern.charAt(index);
            if (isEscaped) {
                appendEscapedRegexChar(regexBuilder, currentChar);
                isEscaped = false;
                continue;
            }
            if (currentChar == '\\') {
                isEscaped = true;
                continue;
            }
            if (currentChar == '*') {
                regexBuilder.append(".*");
                continue;
            }
            if (currentChar == '?') {
                regexBuilder.append('.');
                continue;
            }
            if (currentChar == '[') {
                CharacterClass characterClass = parseCharacterClass(pattern, index);
                regexBuilder.append('[');
                if (characterClass.negated) {
                    regexBuilder.append('^');
                }
                regexBuilder.append(characterClass.classChars).append(']');
                index = characterClass.closeIndex;
                continue;
            }
            appendEscapedRegexChar(regexBuilder, currentChar);
        }
        if (isEscaped) {
            appendEscapedRegexChar(regexBuilder, '\\');
        }
        regexBuilder.append("$");
        return regexBuilder.toString();
    }

    private static CharacterClass parseCharacterClass(String pattern, int openIndex) {
        int classIndex = openIndex + 1;
        boolean negated = false;
        if (classIndex < pattern.length()
                && (pattern.charAt(classIndex) == '!' || pattern.charAt(classIndex) == '^')) {
            negated = true;
            classIndex++;
        }
        boolean classClosed = false;
        boolean isClassEscaped = false;
        StringBuilder classBuffer = new StringBuilder();
        int classCharCount = 0;
        for (; classIndex < pattern.length(); classIndex++) {
            char classChar = pattern.charAt(classIndex);
            if (isClassEscaped) {
                classBuffer.append(classChar);
                isClassEscaped = false;
                classCharCount++;
                continue;
            }
            if (classChar == '\\') {
                isClassEscaped = true;
                continue;
            }
            if (classChar == ']') {
                classClosed = true;
                break;
            }
            classBuffer.append(classChar);
            classCharCount++;
        }
        if (!classClosed) {
            throw new IllegalArgumentException("Unclosed character class in glob pattern: " + pattern);
        }
        if (classCharCount == 0) {
            throw new IllegalArgumentException("Empty character class in glob pattern: " + pattern);
        }
        return new CharacterClass(classBuffer.toString(), negated, classIndex);
    }

    private static Automaton makeCharacterClassAutomaton(String classChars, boolean negated, String pattern) {
        List<Automaton> automata = new ArrayList<>();
        for (int index = 0; index < classChars.length(); index++) {
            char start = classChars.charAt(index);
            if (index + 2 < classChars.length() && classChars.charAt(index + 1) == '-') {
                char end = classChars.charAt(index + 2);
                if (start > end) {
                    throw new IllegalArgumentException("Invalid character range in glob pattern: " + pattern);
                }
                automata.add(BasicAutomata.makeCharRange(start, end));
                index += 2;
            } else {
                automata.add(BasicAutomata.makeChar(start));
            }
        }
        Automaton positive = automata.size() == 1 ? automata.get(0) : BasicOperations.union(automata);
        if (!negated) {
            return positive;
        }
        return BasicOperations.minus(BasicAutomata.makeAnyChar(), positive);
    }

    private static void appendEscapedRegexChar(StringBuilder regexBuilder, char ch) {
        switch (ch) {
            case '.':
            case '^':
            case '$':
            case '+':
            case '*':
            case '?':
            case '(':
            case ')':
            case '|':
            case '{':
            case '}':
            case '[':
            case ']':
            case '\\':
                regexBuilder.append('\\').append(ch);
                break;
            default:
                regexBuilder.append(ch);
                break;
        }
    }

    private static final class CharacterClass {
        private final String classChars;
        private final boolean negated;
        private final int closeIndex;

        private CharacterClass(String classChars, boolean negated, int closeIndex) {
            this.classChars = classChars;
            this.negated = negated;
            this.closeIndex = closeIndex;
        }

        private Automaton toAutomaton(String pattern) {
            return makeCharacterClassAutomaton(classChars, negated, pattern);
        }
    }
}