package com.datumbox.opensource.features;

import com.datumbox.opensource.dataobjects.Document;
import java.util.HashMap;
import java.util.Locale;
import java.util.Map;
import org.apache.commons.lang3.StringUtils;

/* loaded from: classes.dex */
/**
 * Static helpers that turn raw text into a bag of lower-cased keyword counts.
 *
 * <p>The pipeline used by {@link #tokenize(String)} is
 * {@link #preprocess(String)} → {@link #extractKeywords(String)} →
 * {@link #getKeywordCounts(String[])}.
 */
public class TextTokenizer {
    /** Separator written by {@link #preprocess(String)} and split on by {@link #extractKeywords(String)}. */
    private static final String SPACE = " ";

    /**
     * Splits text into keywords on the space character.
     *
     * <p>Empty pieces (produced by leading or consecutive spaces, or by an
     * empty input) are dropped, so the result never contains {@code ""}.
     *
     * @param str the text to split; must not be {@code null}
     * @return the non-empty keywords in order of appearance; an empty array
     *         when {@code str} contains no keyword
     * @throws NullPointerException if {@code str} is {@code null}
     */
    public static String[] extractKeywords(String str) {
        String[] pieces = str.split(SPACE);

        int kept = 0;
        for (String piece : pieces) {
            if (piece.length() > 0) {
                kept++;
            }
        }
        if (kept == pieces.length) {
            // Nothing to filter out: reuse the array produced by split.
            return pieces;
        }

        String[] keywords = new String[kept];
        int next = 0;
        for (String piece : pieces) {
            if (piece.length() > 0) {
                keywords[next++] = piece;
            }
        }
        return keywords;
    }

    /**
     * Counts how many times each keyword occurs in the given array.
     *
     * @param strArr the keywords to count; must not be {@code null}
     * @return a new mutable map from keyword to its number of occurrences
     * @throws NullPointerException if {@code strArr} is {@code null}
     */
    public static Map<String, Integer> getKeywordCounts(String[] strArr) {
        Map<String, Integer> counts = new HashMap<String, Integer>();
        for (String keyword : strArr) {
            Integer seen = counts.get(keyword);
            counts.put(keyword, Integer.valueOf(seen == null ? 1 : seen.intValue() + 1));
        }
        return counts;
    }

    /**
     * Normalises text for tokenisation: every punctuation character becomes a
     * space, each run of whitespace collapses to a single space, and the
     * result is lower-cased.
     *
     * <p>Lower-casing uses {@link Locale#ROOT} so the same text yields the
     * same tokens regardless of the default locale (for example, under a
     * Turkish default locale {@code "I"} would otherwise become a dotless
     * {@code "ı"}). The result may still begin or end with a single space.
     *
     * @param str the raw text; must not be {@code null}
     * @return the normalised text
     * @throws NullPointerException if {@code str} is {@code null}
     */
    public static String preprocess(String str) {
        String withoutPunctuation = str.replaceAll("\\p{P}", SPACE);
        String singleSpaced = withoutPunctuation.replaceAll("\\s+", SPACE);
        return singleSpaced.toLowerCase(Locale.ROOT);
    }

    /**
     * Preprocesses the text, extracts its keywords and stores their counts in
     * the {@code tokens} field of a new {@link Document}.
     *
     * @param str the raw text; must not be {@code null}
     * @return a new document whose {@code tokens} field holds the keyword counts
     * @throws NullPointerException if {@code str} is {@code null}
     */
    public static Document tokenize(String str) {
        String[] keywords = extractKeywords(preprocess(str));
        Document document = new Document();
        document.tokens = getKeywordCounts(keywords);
        return document;
    }
}
