/*
 * Decompiled with CFR 0.152.
 */
package com.ibm.smarts.similarity.classifier.common.core;

import com.ibm.smarts.nlp.embedding.WordEmbedding;
import com.ibm.smarts.similarity.classifier.common.core.AbstractEncoder;
import com.ibm.smarts.similarity.classifier.common.core.ColumnEncodingResult;
import com.ibm.smarts.similarity.classifier.common.core.IOovEncoder;
import com.ibm.smarts.similarity.classifier.common.core.KMeans;
import com.ibm.smarts.similarity.classifier.common.core.KMeansResult;
import com.ibm.smarts.similarity.classifier.common.core.NormedVector;
import com.ibm.smarts.similarity.classifier.common.core.PreProcessor;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.stream.Collectors;

/**
 * Column encoder that summarizes the sample values of a column as a list of
 * cluster centroid vectors plus a short list of leftover ("outlier") samples.
 *
 * <p>Every sample is embedded, then {@code KMeans.fit} is run once per entry of
 * {@link #CLUSTER_NO}. After each round, the samples whose label score exceeds
 * {@link #NEARBY_THRESHOLD} are removed, so the following round only works on
 * the samples that are still left.
 */
public class KMeansEncoder extends AbstractEncoder {
    /**
     * First argument handed to {@code KMeans.fit} in each successive round
     * (presumably the number of clusters to fit -- confirm against KMeans).
     */
    private static final int[] CLUSTER_NO = {7, 3, 3};

    /**
     * A sample is removed after a round when its label score is strictly
     * greater than this value.
     */
    private static final double NEARBY_THRESHOLD = 0.5;

    /**
     * Maximum number of leftover samples reported in the result; the clustering
     * rounds also stop early once no more than this many samples remain.
     */
    private static final int MAX_OUTLIERS = 10;

    /**
     * Creates an encoder on top of the given word embedding.
     *
     * @param wordEmbedding embedding forwarded to the {@link AbstractEncoder} constructor
     */
    public KMeansEncoder(WordEmbedding wordEmbedding) {
        super(wordEmbedding);
    }

    /**
     * Creates an encoder on top of the given word embedding and OOV encoder.
     *
     * @param wordEmbedding embedding forwarded to the {@link AbstractEncoder} constructor
     * @param oovEncoder    OOV encoder forwarded to the {@link AbstractEncoder} constructor
     */
    public KMeansEncoder(WordEmbedding wordEmbedding, IOovEncoder oovEncoder) {
        super(wordEmbedding, oovEncoder);
    }

    /**
     * Encodes a column given by its sample values.
     *
     * <p>Samples for which no embedding can be computed (the embedding is
     * {@code null}) are ignored entirely. If no sample can be embedded, the
     * result holds two empty lists.
     *
     * @param samples sample values of the column; must not be {@code null}
     * @return a result built from the vectors of all collected centroids with a
     *         non-zero norm and at most {@link #MAX_OUTLIERS} of the samples
     *         that were still left after the last clustering round
     */
    @Override
    public ColumnEncodingResult encodeColumn(List<String> samples) {
        // remainingSamples and sampleEmbedding are index-aligned: entry i of one
        // belongs to entry i of the other. They must always be modified together.
        List<String> remainingSamples = new ArrayList<>();
        // NOTE(review): kept as ArrayList because the parameter type of
        // KMeans.fit is not visible from this file.
        ArrayList<NormedVector> sampleEmbedding = new ArrayList<>();
        for (String sample : samples) {
            float[] embedding = this.encodeTokensViaSmoothedMax(PreProcessor.prepareSentenceForEncoding(sample));
            if (embedding == null) {
                continue;
            }
            sampleEmbedding.add(new NormedVector(embedding));
            remainingSamples.add(sample);
        }
        if (sampleEmbedding.isEmpty()) {
            return new ColumnEncodingResult(Collections.emptyList(), Collections.emptyList());
        }

        List<NormedVector> centroids = new ArrayList<>();
        for (int clusterCount : CLUSTER_NO) {
            KMeansResult result = KMeans.fit(clusterCount, sampleEmbedding);
            centroids.addAll(result.getCentroids());
            this.removeNearBySamples(remainingSamples, sampleEmbedding, result);
            // Stop as soon as nothing is left to cluster or the leftovers
            // already fit into the outlier list.
            if (sampleEmbedding.isEmpty() || remainingSamples.size() <= MAX_OUTLIERS) {
                break;
            }
        }

        List<float[]> centroidVectors = centroids.stream()
                .filter(centroid -> centroid.getNorm() != 0.0)
                .map(centroid -> centroid.getVector())
                .collect(Collectors.toList());

        // Copy instead of returning the subList view, so the result does not
        // keep the complete backing list of samples reachable.
        int outlierCount = Math.min(MAX_OUTLIERS, remainingSamples.size());
        List<String> outliers = new ArrayList<>(remainingSamples.subList(0, outlierCount));
        return new ColumnEncodingResult(centroidVectors, outliers);
    }

    /**
     * Removes, from both index-aligned lists, every sample whose label score in
     * {@code result} is strictly greater than {@link #NEARBY_THRESHOLD}. Does
     * nothing when {@code remainingSamples} is empty.
     *
     * @param remainingSamples sample strings, modified in place
     * @param sampleEmbedding  embeddings aligned with {@code remainingSamples}, modified in place
     * @param result           clustering result whose labels are indexed like the two lists
     *                         (assumed to come from fitting exactly {@code sampleEmbedding})
     */
    private void removeNearBySamples(List<String> remainingSamples, List<NormedVector> sampleEmbedding, KMeansResult result) {
        if (remainingSamples.isEmpty()) {
            return;
        }
        // Iterate from the back so that removing index s never shifts an entry
        // that still has to be visited.
        for (int s = result.getLables().length - 1; s >= 0; --s) {
            if (result.getLables()[s].getScore() > NEARBY_THRESHOLD) {
                sampleEmbedding.remove(s);
                remainingSamples.remove(s);
            }
        }
    }
}

