/*
 * Decompiled with CFR 0.152.
 */
package com.github.tjake.jlama.model.gemma;

import com.github.tjake.jlama.safetensors.tokenizer.BPETokenizer;
import java.nio.file.Path;
import java.util.Optional;
import java.util.regex.Pattern;

/**
 * {@link BPETokenizer} specialisation for Gemma models.
 *
 * <p>Spaces are swapped for the U+2581 marker before encoding and swapped back when tokens are
 * decoded, and raw bytes are mapped to token ids by adding a fixed offset.
 */
public class GemmaTokenizer extends BPETokenizer {
    /** Marker (U+2581) that stands in for a space in pre-processed text. */
    static final String SPIECE_UNDERLINE = "\u2581";

    /**
     * Value added to an unsigned byte to obtain its token id.
     * NOTE(review): presumably the id of the first byte token in the Gemma vocabulary; confirm
     * against the model's tokenizer definition.
     */
    private static final int BYTE_FALLBACK_ENCODING_OFFSET = 217;

    /** Number of distinct byte values, i.e. the width of the byte-token id range. */
    private static final int BYTE_VALUE_COUNT = 256;

    /**
     * Matches {@code <s>} and {@code </s>}. Compiled once here instead of on every
     * {@link #postProcessToken(String)} call.
     */
    private static final Pattern SENTENCE_MARKERS = Pattern.compile("</?s>");

    private final int byteFallbackEncodingOffset;

    /**
     * Creates a tokenizer from the given model directory.
     *
     * @param modelRoot path handed to the {@link BPETokenizer} constructor
     */
    public GemmaTokenizer(Path modelRoot) {
        super(modelRoot);
        this.byteFallbackEncodingOffset = BYTE_FALLBACK_ENCODING_OFFSET;
    }

    /**
     * Maps a raw byte to its token id.
     *
     * @param c the byte to encode; interpreted as unsigned (0-255)
     * @return the unsigned byte value plus the byte-fallback offset
     */
    @Override
    protected long encodeCharacterAsToken(byte c) {
        return Byte.toUnsignedLong(c) + (long) this.byteFallbackEncodingOffset;
    }

    /**
     * Inverse of {@link #encodeCharacterAsToken(byte)}.
     *
     * @param id token id to inspect
     * @return the byte value as a {@code char} when byte fallback is enabled on the model and
     *     {@code id} lies inside the byte-token id range; otherwise an empty {@code Optional}
     */
    @Override
    protected Optional<Character> maybeDecodeTokenAsCharacter(long id) {
        long firstByteToken = this.byteFallbackEncodingOffset;
        long endByteToken = firstByteToken + BYTE_VALUE_COUNT; // exclusive upper bound
        if (this.model.byteFallback && id >= firstByteToken && id < endByteToken) {
            char c = (char) (id - firstByteToken);
            return Optional.of(Character.valueOf(c));
        }
        return Optional.empty();
    }

    /**
     * Prepares text for encoding by replacing every space with {@link #SPIECE_UNDERLINE}.
     *
     * @param sentence the raw input text
     * @return the text with spaces replaced by the marker
     */
    @Override
    protected String preProcess(String sentence) {
        return sentence.replace(" ", SPIECE_UNDERLINE);
    }

    /**
     * Cleans up fully decoded text by removing leading whitespace.
     *
     * @param sentence the decoded text
     * @return {@code sentence} without leading whitespace
     */
    @Override
    protected String postProcess(String sentence) {
        return sentence.stripLeading();
    }

    /**
     * Cleans up a single decoded token: removes {@code <s>} / {@code </s>} markers and turns
     * every {@link #SPIECE_UNDERLINE} back into a space.
     *
     * @param decoded the decoded token text; {@code null} is replaced by the model's unknown token
     * @return the cleaned token text
     */
    @Override
    protected String postProcessToken(String decoded) {
        String token = decoded != null ? decoded : this.model.unkToken;
        String withoutMarkers = SENTENCE_MARKERS.matcher(token).replaceAll("");
        // Literal replacement: no regex is needed to swap the marker for a space.
        return withoutMarkers.replace(SPIECE_UNDERLINE, " ");
    }
}

