/*
 * Decompiled with CFR 0.152.
 */
package com.ibm.smarts.similarity.classifier.common.core;

import com.ibm.smarts.similarity.classifier.common.core.PreProcessor;
import com.ibm.smarts.similarity.classifier.common.core.Token;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Static utility that splits text into whitespace-delimited {@link Token}s after
 * blanking out most punctuation.
 *
 * <p>The class holds no state and cannot be instantiated.
 */
public class Tokenizer {
    /**
     * Matches one or more consecutive ASCII punctuation characters, excluding the
     * period, the apostrophe and the hyphen, which are therefore kept inside tokens.
     */
    private static final Pattern SPECIAL_CHARS_PATTERN = Pattern.compile("[\\p{Punct}&&[^.'-]]+");

    /** Matches a maximal run of non-whitespace characters, i.e. one token. */
    private static final Pattern TOKEN_PATTERN = Pattern.compile("\\S+");

    private Tokenizer() {
    }

    /**
     * Tokenizes {@code text} without filtering any words.
     *
     * <p>Equivalent to {@code tokenize(text, false)}.
     *
     * @param text the text to tokenize; must not be {@code null}
     * @return the tokens in order of appearance; empty if no token is found
     * @throws NullPointerException if {@code text} is {@code null}
     */
    public static List<Token> tokenize(String text) {
        return Tokenizer.tokenize(text, false);
    }

    /**
     * Tokenizes {@code text}.
     *
     * <p>Every run of punctuation matched by the special-characters pattern is first
     * replaced by a single space; the result is then split into runs of
     * non-whitespace characters. Each token records the start index of its match
     * together with the matched text.
     *
     * <p>Because a punctuation run of any length collapses to one space, the cleaned
     * text can be shorter than {@code text}, so the recorded start index refers to
     * the cleaned text and may differ from the position in the original input.
     * NOTE(review): confirm whether consumers of {@link Token} expect offsets into
     * the original text.
     *
     * @param text the text to tokenize; must not be {@code null}
     * @param removeConjunctions if {@code true}, words for which
     *        {@code PreProcessor.isConjunctionWord} returns {@code true} are skipped
     * @return a new mutable list of tokens in order of appearance; empty if no token is found
     * @throws NullPointerException if {@code text} is {@code null}
     */
    public static List<Token> tokenize(String text, boolean removeConjunctions) {
        // Fail fast with a clear message instead of an anonymous NPE from the regex engine.
        if (text == null) {
            throw new NullPointerException("text must not be null");
        }
        String cleanText = SPECIAL_CHARS_PATTERN.matcher(text).replaceAll(" ");
        Matcher matcher = TOKEN_PATTERN.matcher(cleanText);
        List<Token> tokens = new ArrayList<Token>();
        while (matcher.find()) {
            String word = matcher.group();
            if (removeConjunctions && PreProcessor.isConjunctionWord(word)) {
                continue;
            }
            tokens.add(new Token(matcher.start(), word));
        }
        return tokens;
    }
}

