/*
 * Decompiled with CFR 0.152.
 */
package com.ibm.smarts.similarity.classifier.builder.core;

import com.ibm.smarts.model.builder.ColumnEmbedding;
import com.ibm.smarts.model.builder.SampleExtraFeatures;
import com.ibm.smarts.schema.BaseItemObject;
import com.ibm.smarts.schema.ColumnInfo;
import com.ibm.smarts.schema.DatasetInfo;
import com.ibm.smarts.schema.SemanticInfo;
import com.ibm.smarts.schema.util.SmartsModuleUtil;
import com.ibm.smarts.similarity.classifier.common.core.ColumnEncodingResult;
import com.ibm.smarts.similarity.classifier.common.core.IEncoder;
import com.ibm.smarts.similarity.classifier.common.utils.SimilarityUtility;
import java.util.ArrayList;
import java.util.List;
import java.util.Objects;
import java.util.stream.Collectors;
import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Static helper that turns the columns of a {@link DatasetInfo} into
 * {@link ColumnEmbedding} objects by running their sample values through an
 * {@link IEncoder}.
 *
 * <p>This class only has static members and cannot be instantiated.
 */
public class EncodeDatasetHelper {
    /** Sample values whose length is greater than or equal to this are discarded. */
    private static final int MAX_SAMPLE_LENGTH = 200;
    /** Upper bound on the number of sample values kept per column. */
    private static final int MAX_SAMPLES = 200;
    private static final Logger LOGGER = LoggerFactory.getLogger(EncodeDatasetHelper.class);

    private EncodeDatasetHelper() {
    }

    /**
     * Encodes every flattened column of the dataset that is encodable and whose
     * semantic info is not searchable.
     *
     * <p>A column only contributes a {@link ColumnEmbedding} when the encoder
     * returns a non-empty list of centroids for its cleaned sample values.
     *
     * @param datasetInfo   dataset whose flattened columns are encoded
     * @param encoder       encoder used for the column samples and, optionally,
     *                      for the extra sample features
     * @param useLengthInfo when {@code true}, extra features are created through
     *                      {@link IEncoder#createSampleExtraFeatures}; otherwise
     *                      {@code null} is passed to the embedding
     * @return the embeddings of all columns that produced at least one centroid
     */
    public static List<ColumnEmbedding> encodeDataset(DatasetInfo datasetInfo, IEncoder encoder, boolean useLengthInfo) {
        long startTime = System.currentTimeMillis();
        List<ColumnEmbedding> results = new ArrayList<>();
        // NOTE(review): the explicit casts on the project API calls are kept from the
        // decompiled source; they may pin overload selection - confirm before removing.
        SmartsModuleUtil.getFlattenedColumns((BaseItemObject)datasetInfo).forEach(col -> {
            boolean encodable = SimilarityUtility.isEncodable((ColumnInfo)col);
            if (!encodable || SimilarityUtility.isSearchable((SemanticInfo)col.getSemanticInfo())) {
                return;
            }
            List<String> sample = EncodeDatasetHelper.getCleanDistinctSamples(col, false);
            ColumnEncodingResult columnEncodingResult = encoder.encodeColumn(sample);
            // Raw type kept on purpose: the element type of the centroid list is not visible here.
            List centroids = columnEncodingResult.getCentroids();
            if (centroids.isEmpty()) {
                return;
            }
            // Extra features are only computed for columns that actually get an embedding.
            SampleExtraFeatures extraFeatures = useLengthInfo ? encoder.createSampleExtraFeatures(sample) : null;
            results.add(new ColumnEmbedding(col.getId(), col.getIdForExpression(), datasetInfo.getId(), centroids, columnEncodingResult.getOutliers(), extraFeatures));
        });
        LOGGER.debug("Encoded Dataset [{}] in {}ms. Number of encoded columns={}", new Object[]{datasetInfo.getId(), System.currentTimeMillis() - startTime, results.size()});
        return results;
    }

    /**
     * Collects the usable distinct sample values of a column.
     *
     * <p>{@code null} and blank values are skipped, values with a length of
     * {@link #MAX_SAMPLE_LENGTH} or more are skipped, and at most
     * {@link #MAX_SAMPLES} of the remaining values are kept.
     *
     * @param columnInfo    column providing the distinct sample values
     * @param useColumnName when {@code true}, the column name is appended after
     *                      the sample values (it is not subject to the filters
     *                      or the limit)
     * @return a new mutable list of the retained values
     */
    private static List<String> getCleanDistinctSamples(ColumnInfo columnInfo, boolean useColumnName) {
        // Collect into an explicit ArrayList: the list may be appended to below, and
        // Collectors.toList() gives no guarantee that its result is mutable.
        List<String> samples = columnInfo.getSampleDistinctValues().stream()
                .filter(Objects::nonNull)
                .filter(s -> !StringUtils.isBlank((CharSequence)s))
                .filter(s -> s.length() < MAX_SAMPLE_LENGTH)
                .limit(MAX_SAMPLES)
                .collect(Collectors.toCollection(ArrayList::new));
        if (useColumnName) {
            samples.add(columnInfo.getName());
        }
        return samples;
    }
}

