# coding: utf-8
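"""Preprocess a Chinese text corpus into fixed-length, padded token-id sequences.

Pipeline: jieba word segmentation -> Keras Tokenizer -> pad_sequences.
"""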

import pandas as pd
import jieba
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


def text2words(text):
    """Segment `text` into words with jieba and wrap it in #BEGIN/#END boundary markers."""
    return "#BEGIN " + " ".join(word.lower() for word in jieba.cut(text)) + " #END"


# Load the training set: tab-separated, no header row; column 0 holds the raw text.
train = pd.read_csv("input/train.csv", header=None, sep="\t")
train[0] = train[0].apply(text2words)
# Exclude '#' from the filter list (the Keras default strips it); otherwise the
# #BEGIN/#END markers would be reduced to plain "begin"/"end" tokens.
tokenizer = Tokenizer(num_words=None, filters='!"$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n')
tokenizer.fit_on_texts(train[0].tolist())
sequences = tokenizer.texts_to_sequences(train[0].tolist())
# pad_sequences left-pads and front-truncates to maxlen=15 by default; note that
# front-truncation drops the #BEGIN marker from sequences longer than 15 tokens.
train_features = pad_sequences(sequences, maxlen=15)
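
# Quick sanity check (illustrative): the features form a 2-D int array with one
# row per training example and 15 columns; Tokenizer reserves index 0 for padding.
print("features shape:", train_features.shape)
print("vocabulary size:", len(tokenizer.word_index))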