/*
 * Decompiled with CFR 0.152.
 */
package com.ibm.smarts.similarity.classifier.common.core;

import com.ibm.smarts.similarity.classifier.common.core.Token;
import com.ibm.smarts.similarity.classifier.common.core.Tokenizer;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.Set;
import java.util.regex.Pattern;
import java.util.stream.Collectors;

/**
 * Static helpers that strip stop words and canned "type-ahead" phrases from
 * text before it is handed on for encoding or inference.
 *
 * <p>The class cannot be instantiated; every member is static.
 */
public class PreProcessor {
    /** Separator used both to split text into words and to join them back. */
    private static final String SPACE = " ";

    /**
     * Lower-case words treated as stop words by {@link #isStopWord(String)}.
     *
     * <p>Lookup is by exact (lower-cased) string, so the multi-word entries such as
     * {@code "not in"} or {@code "how many"} only match when a caller passes that
     * whole phrase as a single word.
     */
    protected static final Set<String> ENGLISH_STOP_WORDS = new HashSet<String>(Arrays.asList(
            "a", "also", "an", "and", "are", "average", "be", "column", "i", "the", "total", ".", "in",
            "include", "including", "inside", "exclude", "excluding", "outside", "not in", "not including",
            "about", "details", "above", "across", "after", "afterwards", "again", "against", "all",
            "almost", "alone", "along", "already", "although", "always", "am", "among", "amongst",
            "amoungst", "another", "any", "anyhow", "anyone", "anything", "anyway", "anywhere", "around",
            "as", "at", "back", "became", "because", "become", "becomes", "becoming", "been", "before",
            "beforehand", "behind", "being", "below", "beside", "besides", "between", "beyond", "bill",
            "both", "bottom", "but", "by", "call", "can", "cannot", "cant", "co", "con", "could",
            "couldnt", "cry", "describe", "detail", "do", "done", "down", "due", "during", "each", "eg",
            "either", "else", "elsewhere", "empty", "enough", "etc", "even", "ever", "every", "everyone",
            "everything", "everywhere", "except", "few", "fill", "find", "for", "former", "formerly",
            "found", "from", "front", "full", "further", "get", "give", "go", "had", "has", "hasnt",
            "have", "he", "hence", "her", "here", "hereafter", "hereby", "herein", "hereupon", "hers",
            "herself", "him", "himself", "his", "how", "however", "ie", "if", "inc", "indeed", "interest",
            "into", "is", "it", "its", "itself", "keep", "last", "latter", "latterly", "least", "less",
            "ltd", "made", "many", "may", "me", "meanwhile", "might", "mill", "mine", "more", "moreover",
            "most", "mostly", "move", "much", "must", "my", "myself", "name", "namely", "neither", "never",
            "nevertheless", "next", "no", "nobody", "none", "noone", "nor", "not", "nothing", "now",
            "nowhere", "off", "often", "on", "once", "only", "onto", "or", "other", "others", "otherwise",
            "our", "ours", "ourselves", "out", "over", "own", "part", "per", "perhaps", "please", "put",
            "rather", "re", "same", "see", "seem", "seemed", "seeming", "seems", "serious", "several",
            "she", "should", "show", "side", "since", "sincere", "so", "some", "somehow", "someone",
            "something", "sometime", "sometimes", "somewhere", "still", "such", "take", "than", "that",
            "their", "them", "themselves", "then", "thence", "there", "thereafter", "thereby", "therefore",
            "therein", "thereupon", "these", "they", "thick", "thin", "this", "those", "though", "through",
            "throughout", "thru", "thus", "to", "together", "too", "top", "toward", "towards", "un",
            "under", "until", "up", "upon", "us", "very", "via", "was", "we", "well", "were", "what",
            "whatever", "when", "whence", "whenever", "where", "whereafter", "whereas", "whereby",
            "wherein", "whereupon", "wherever", "whether", "which", "while", "whither", "who", "whoever",
            "whole", "whom", "whose", "why", "will", "with", "within", "without", "would", "yet", "you",
            "your", "yours", "yourself", "yourselves", "greater", "bigger", "larger", "smaller", "lower",
            "fewer", "avg", "count", "max", "sum", "how many", "how much", "min", "minimal", "minimum",
            "lowest", "maximal", "maximum", "highest", "not on", "not for", "chart", "charts", "influence",
            "influences", "tell", "impact", "impacts", "drive", "drives", "know", "knew", "create",
            "dashboard"));

    /**
     * Case-insensitive alternation of canned phrases that
     * {@link #removeAutoSuggestPhrases(String)} deletes from a sentence.
     *
     * <p>NOTE(review): the {@code ?} in {@code where should I start?} is unescaped, so it acts
     * as a quantifier making the final {@code t} optional rather than matching a literal
     * question mark; and the {@code " generate questions"} alternative begins with a space.
     * Both are kept exactly as found to preserve current matching - confirm whether intended.
     */
    protected static final Pattern typeAheadPattern = Pattern.compile("(?i)create dashboard|generate dashboard|create chart|reset conversation|generate a dashboard|generate a chart|draw chart|load data|show column|show data|show details|show influencers|show source|tell me about|what do you know about|what impacts|what influences column|what influences|what is| generate questions|suggest questions|suggest starting points|where should I start?|what can you tell me about this dataset|what can I do here|I don't know where to start");

    /** Lower-case words reported as conjunctions by {@link #isConjunctionWord(String)}. */
    public static final Set<String> CONJUNCTION_WORDS = new HashSet<String>(Arrays.asList("of"));

    private PreProcessor() {
    }

    /**
     * Tells whether {@code word} is in {@link #ENGLISH_STOP_WORDS}, ignoring case.
     *
     * @param word the word to test; must not be {@code null}
     * @return {@code true} if the lower-cased word is a stop word
     */
    public static boolean isStopWord(String word) {
        // Locale.ROOT keeps the lookup stable regardless of the JVM default locale
        // (e.g. a Turkish default would lower-case "I" to a dotless i and miss "i").
        return ENGLISH_STOP_WORDS.contains(word.toLowerCase(Locale.ROOT));
    }

    /**
     * Tells whether {@code word} is in {@link #CONJUNCTION_WORDS}, ignoring case.
     *
     * @param word the word to test; must not be {@code null}
     * @return {@code true} if the lower-cased word is a conjunction word
     */
    public static boolean isConjunctionWord(String word) {
        return CONJUNCTION_WORDS.contains(word.toLowerCase(Locale.ROOT));
    }

    /**
     * Tokenizes {@code sentence} and returns the token texts that are not stop words.
     *
     * @param sentence the text to tokenize
     * @return the texts of the non-stop-word tokens, in tokenizer order
     */
    public static List<String> prepareSentenceForEncoding(String sentence) {
        return tokenTextsWithoutStopWords(sentence);
    }

    /**
     * Same as {@link #prepareSentenceForEncoding(String)}, but first deletes the
     * type-ahead phrases via {@link #removeAutoSuggestPhrases(String)}.
     *
     * @param sentence the text to clean and tokenize
     * @return the texts of the non-stop-word tokens, in tokenizer order
     */
    public static List<String> prepareSentenceForInference(String sentence) {
        return tokenTextsWithoutStopWords(PreProcessor.removeAutoSuggestPhrases(sentence));
    }

    /**
     * Deletes every match of {@link #typeAheadPattern} from {@code sentence}.
     *
     * @param sentence the text to clean
     * @return {@code sentence} with all matches replaced by the empty string
     */
    public static String removeAutoSuggestPhrases(String sentence) {
        return typeAheadPattern.matcher(sentence).replaceAll("");
    }

    /**
     * Builds a new token whose text has trailing and then leading stop words removed
     * and whose start index is shifted to where the remaining text begins.
     *
     * @param token the token to trim
     * @return a new {@link Token} with the trimmed text; if nothing remains, the text is
     *         empty and the start index equals the original token's start index
     */
    public static Token removeTrailingAndLeadingStopWords(Token token) {
        String withoutTrailing = PreProcessor.removeTrailingStopWords(token.getText());
        String trimmed = PreProcessor.removeLeadingStopWords(withoutTrailing);
        // withoutTrailing is a prefix of the token text and trimmed is its tail, so the last
        // occurrence is the true position. Searching for the first remaining word with
        // indexOf would instead hit an earlier stop word containing it ("what hat" -> 1, not 5).
        int offset = trimmed.isEmpty() ? 0 : withoutTrailing.lastIndexOf(trimmed);
        return new Token(token.getStartCharIndex() + offset, trimmed);
    }

    /**
     * Drops stop words from the end of a space-separated text.
     *
     * <p>The word at index 0 is never examined, so the first word is always kept.
     *
     * @param queryText the space-separated text to trim
     * @return the remaining words joined by a single space
     */
    public static String removeTrailingStopWords(String queryText) {
        List<String> words = new ArrayList<>(Arrays.asList(queryText.split(SPACE)));
        int last = words.size() - 1;
        while (last > 0 && PreProcessor.isStopWord(words.get(last))) {
            words.remove(last);
            last--;
        }
        return String.join(SPACE, words);
    }

    /**
     * Drops stop words from the start of a space-separated text, stopping at the first
     * word that is not a stop word.
     *
     * @param queryText the space-separated text to trim
     * @return the remaining words joined by a single space; empty if every word was a stop word
     */
    public static String removeLeadingStopWords(String queryText) {
        List<String> words = new ArrayList<>(Arrays.asList(queryText.split(SPACE)));
        Iterator<String> it = words.iterator();
        while (it.hasNext() && PreProcessor.isStopWord(it.next())) {
            it.remove();
        }
        return String.join(SPACE, words);
    }

    /** Tokenizes {@code sentence} and keeps the token texts that are not stop words. */
    private static List<String> tokenTextsWithoutStopWords(String sentence) {
        return Tokenizer.tokenize(sentence).stream()
                .map(Token::getText)
                .filter(text -> !PreProcessor.isStopWord(text))
                .collect(Collectors.toList());
    }
}

