/*
 * Decompiled with CFR 0.152.
 */
package com.ibm.bi.search.extract;

import com.ibm.bi.search.extract.AbstractExtractData;
import java.io.IOException;
import java.io.InputStream;
import java.lang.invoke.MethodHandles;
import java.util.HashSet;
import java.util.Set;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Extracts text content from an HTML input using jsoup.
 *
 * <p>The result is built from the document title plus the set of
 * space-delimited tokens of the body text, combined by the inherited
 * {@code convertSetToString} helper.
 */
public class ExtractHTMLData
extends AbstractExtractData {
    private static final Logger LOG = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());

    /** Largest uncompressed size, in bytes, that will be parsed (0xA00000 = 10,485,760). */
    private static final long MAX_FILE_SIZE_BYTES = 0xA00000L;

    /** Charset name handed to jsoup for decoding the input stream. */
    private static final String CHARSET_NAME = "UTF-8";

    /** Delimiter used to split the body text into tokens. */
    private static final String TOKEN_DELIMITER = " ";

    /**
     * Creates an extractor by delegating to the superclass constructor.
     *
     * @param fileName   passed through to {@link AbstractExtractData}
     * @param compressed passed through to {@link AbstractExtractData}
     * @throws IOException if the superclass constructor fails
     */
    ExtractHTMLData(String fileName, boolean compressed) throws IOException {
        super(fileName, compressed);
    }

    /**
     * Parses the input as HTML and returns its title and body tokens as one string.
     *
     * <p>{@code fileInput} is closed on every path out of this method.
     *
     * @return the extracted content, or {@code null} when the uncompressed size
     *         exceeds {@link #MAX_FILE_SIZE_BYTES} or the document cannot be processed
     * @throws IOException if closing the input stream fails (or, presumably, if the
     *         size lookup fails)
     */
    @Override
    public String analyzeData() throws IOException {
        long fileSize = this.getUncompressedFileSize();
        LOG.trace("\t extracting HTML text file {} of size {}", this.extractFileName, fileSize);

        if (fileSize > MAX_FILE_SIZE_BYTES) {
            LOG.warn("\t extract HTML text failed: {} size over limit of {}", this.extractFileName, MAX_FILE_SIZE_BYTES);
            this.fileInput.close();
            return null;
        }

        HashSet<String> contentSet = new HashSet<>();
        try {
            Document htmlDoc = Jsoup.parse((InputStream) this.fileInput, CHARSET_NAME, "");

            // The title is added as a single entry; it is not tokenized.
            // NOTE(review): jsoup appears to return "" when there is no <title>,
            // so an empty entry may end up in the set - confirm this is intended.
            contentSet.add(htmlDoc.title());

            // Guard against a missing body element (some jsoup versions can return
            // null here, e.g. for frameset pages) instead of relying on the
            // catch-all below to absorb a NullPointerException.
            String bodyText = htmlDoc.body() == null ? "" : htmlDoc.body().text();
            contentSet.addAll(this.tokenizeString(bodyText, TOKEN_DELIMITER));

            return this.convertSetToString(contentSet);
        }
        catch (Exception e) {
            // Best-effort extraction: log the failure and report "no content".
            // A trailing Throwable argument is logged by SLF4J with its stack trace.
            LOG.warn("\t extract HTML text failed: {} failed to process file", this.extractFileName, e);
            return null;
        }
        finally {
            this.fileInput.close();
        }
    }
}

