/*
 * Decompiled with CFR 0.152.
 */
package com.agentsflex.core.document.splitter;

import com.agentsflex.core.document.Document;
import com.agentsflex.core.document.DocumentSplitter;
import com.agentsflex.core.document.id.DocumentIdGenerator;
import com.agentsflex.core.util.StringUtil;
import com.knuddels.jtokkit.Encodings;
import com.knuddels.jtokkit.api.Encoding;
import com.knuddels.jtokkit.api.EncodingRegistry;
import com.knuddels.jtokkit.api.EncodingType;
import com.knuddels.jtokkit.api.IntArrayList;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

/**
 * Splits a {@link Document} into chunks whose size is measured in tokens.
 *
 * <p>The document content is encoded with the configured {@link EncodingType}, cut into
 * windows of at most {@code chunkSize} tokens, and every window is decoded back to text
 * and trimmed. Consecutive windows start {@code chunkSize - overlapSize} tokens apart.
 */
public class SimpleTokenizeSplitter implements DocumentSplitter {
    /** Unicode replacement character (U+FFFD) looked for at the edges of a decoded chunk. */
    private static final char REPLACEMENT_CHARACTER = '\ufffd';

    private EncodingRegistry registry = Encodings.newLazyEncodingRegistry();
    private EncodingType encodingType = EncodingType.CL100K_BASE;
    /** Maximum number of tokens per chunk (before any one-token boundary widening). */
    private int chunkSize;
    /** Number of tokens shared by two consecutive chunks; defaults to 0. */
    private int overlapSize;

    /**
     * Creates a splitter without overlap between chunks.
     *
     * @param chunkSize maximum number of tokens per chunk
     * @throws IllegalArgumentException if {@code chunkSize} is not greater than 0
     */
    public SimpleTokenizeSplitter(int chunkSize) {
        this(chunkSize, 0);
    }

    /**
     * Creates a splitter whose consecutive chunks share {@code overlapSize} tokens.
     *
     * @param chunkSize   maximum number of tokens per chunk
     * @param overlapSize number of tokens shared by consecutive chunks
     * @throws IllegalArgumentException if {@code chunkSize} is not greater than 0, or
     *                                  {@code overlapSize} is not less than {@code chunkSize}
     */
    public SimpleTokenizeSplitter(int chunkSize, int overlapSize) {
        this.chunkSize = chunkSize;
        this.overlapSize = overlapSize;
        if (this.chunkSize <= 0) {
            throw new IllegalArgumentException("chunkSize must be greater than 0, chunkSize: " + this.chunkSize);
        }
        if (this.overlapSize >= this.chunkSize) {
            throw new IllegalArgumentException("overlapSize must be less than chunkSize, overlapSize: " + this.overlapSize + ", chunkSize: " + this.chunkSize);
        }
    }

    /** @return the maximum number of tokens per chunk */
    public int getChunkSize() {
        return this.chunkSize;
    }

    /**
     * Sets the maximum number of tokens per chunk. The value is not checked here;
     * {@link #split(Document, DocumentIdGenerator)} rejects an invalid configuration.
     *
     * @param chunkSize maximum number of tokens per chunk
     */
    public void setChunkSize(int chunkSize) {
        this.chunkSize = chunkSize;
    }

    /** @return the number of tokens shared by consecutive chunks */
    public int getOverlapSize() {
        return this.overlapSize;
    }

    /**
     * Sets the number of tokens shared by consecutive chunks. The value is not checked here;
     * {@link #split(Document, DocumentIdGenerator)} rejects an invalid configuration.
     *
     * @param overlapSize number of tokens shared by consecutive chunks
     */
    public void setOverlapSize(int overlapSize) {
        this.overlapSize = overlapSize;
    }

    /** @return the registry used to look up the encoding */
    public EncodingRegistry getRegistry() {
        return this.registry;
    }

    /** @param registry the registry used to look up the encoding */
    public void setRegistry(EncodingRegistry registry) {
        this.registry = registry;
    }

    /** @return the encoding type used to tokenize the content */
    public EncodingType getEncodingType() {
        return this.encodingType;
    }

    /** @param encodingType the encoding type used to tokenize the content */
    public void setEncodingType(EncodingType encodingType) {
        this.encodingType = encodingType;
    }

    /**
     * Splits the document content into token-bounded chunks.
     *
     * <p>Every chunk is a new {@link Document} that receives the metadata map of the source
     * document, the trimmed chunk text, and an id produced by {@code idGenerator}. Windows
     * that decode to nothing but whitespace produce no chunk.
     *
     * @param document    the document to split; may be {@code null}
     * @param idGenerator generator for chunk ids; when {@code null} the chunk id is set to {@code null}
     * @return the chunks in document order; an empty list when {@code document} is
     *         {@code null} or has no text
     * @throws IllegalStateException if the setters left {@code chunkSize <= 0} or
     *                               {@code overlapSize >= chunkSize}
     */
    @Override
    public List<Document> split(Document document, DocumentIdGenerator idGenerator) {
        if (document == null || StringUtil.noText(document.getContent())) {
            return Collections.emptyList();
        }
        // The setters do not validate, so re-check here: a non-positive chunk size or
        // stride would otherwise make the loop below spin forever or index out of range.
        long step = (long) this.chunkSize - this.overlapSize;
        if (this.chunkSize <= 0 || step <= 0) {
            throw new IllegalStateException("chunkSize must be greater than 0 and overlapSize must be less than chunkSize, overlapSize: " + this.overlapSize + ", chunkSize: " + this.chunkSize);
        }

        Encoding encoding = this.registry.getEncoding(this.encodingType);
        List<Integer> tokens = encoding.encode(document.getContent()).boxed();
        int tokenCount = tokens.size();

        List<Document> chunks = new ArrayList<>();
        int cursor = 0;
        while (cursor < tokenCount) {
            int start = cursor;
            // long arithmetic: cursor + chunkSize may exceed Integer.MAX_VALUE.
            int end = (int) Math.min((long) start + this.chunkSize, tokenCount);
            String chunkText = decodeRange(encoding, tokens, start, end);

            if (!chunkText.isEmpty()) {
                // A replacement character at an edge presumably means the window cut through
                // a multi-token character; take one more token on that side and decode again.
                // The widening is clamped so it never leaves the token list.
                boolean firstIsReplacement = chunkText.charAt(0) == REPLACEMENT_CHARACTER;
                boolean lastIsReplacement = chunkText.charAt(chunkText.length() - 1) == REPLACEMENT_CHARACTER;
                int widenedStart = (firstIsReplacement && start > 0) ? start - 1 : start;
                int widenedEnd = (lastIsReplacement && end < tokenCount) ? end + 1 : end;
                if (widenedStart != start || widenedEnd != end) {
                    start = widenedStart;
                    end = widenedEnd;
                    chunkText = decodeRange(encoding, tokens, start, end);
                }
                chunks.add(createChunk(document, chunkText, idGenerator));
            }

            // Advance from the (possibly widened) start, but always by at least one token
            // so that a widened start combined with a stride of 1 cannot stall the loop.
            long next = Math.max((long) cursor + 1, (long) start + step);
            cursor = (int) Math.min(next, tokenCount);
        }
        return chunks;
    }

    /** Decodes {@code tokens[fromIndex, toIndex)} to text and trims surrounding whitespace. */
    private static String decodeRange(Encoding encoding, List<Integer> tokens, int fromIndex, int toIndex) {
        IntArrayList window = new IntArrayList();
        for (Integer token : tokens.subList(fromIndex, toIndex)) {
            window.add(token.intValue());
        }
        return encoding.decode(window).trim();
    }

    /** Builds a chunk document carrying the source metadata, the chunk text and a generated id. */
    private static Document createChunk(Document source, String text, DocumentIdGenerator idGenerator) {
        Document chunk = new Document();
        chunk.addMetadata(source.getMetadataMap());
        chunk.setContent(text);
        chunk.setId(idGenerator == null ? null : idGenerator.generateId(chunk));
        return chunk;
    }
}

