文本分类 实践
spark版本#
数据预处理:对于中文,需要先对原始语料库进行分词。
预处理
from pyspark.sql.types import StringType, ArrayType, IntegerType, FloatType,MapType
from pyspark.sql.functions import udf, lit, size, explode
import numpy as np
import jieba
def get_stop_words(path='stop_words.txt', encoding='utf-8'):
    """Load the stop-word list into a dict for fast membership tests.

    Each line contributes its first tab-separated field (after stripping
    surrounding whitespace); any further fields on the line are ignored.
    Blank lines are skipped.

    Args:
        path: Location of the stop-word file. Defaults to ``stop_words.txt``
            in the current working directory.
        encoding: Text encoding of the file. Defaults to UTF-8;
            NOTE(review): confirm this matches the actual stop-word file.

    Returns:
        A dict mapping every stop word to ``0``. The value is only a
        placeholder.
    """
    stop_words = {}
    # The context manager guarantees the handle is closed even if reading fails.
    with open(path, encoding=encoding) as handle:
        for line in handle:
            word = line.strip().split('\t')[0]
            if word:
                stop_words[word] = 0
    return stop_words
## Step 1: basic tokenization + stop-word removal
def get_corpus(raw_corpus, stop_dict=None):
    """Tokenize one raw document with jieba and drop stop words.

    Double quotes, square brackets and spaces are removed before
    segmentation. The result contains each surviving token once, in order
    of first appearance.

    Args:
        raw_corpus: The raw text of one document. ``None`` or an empty
            string yields an empty list.
        stop_dict: Optional container of stop words supporting ``in``.
            When omitted, the list is loaded via ``get_stop_words()`` on
            every call; pass a preloaded one to avoid re-reading the file.

    Returns:
        A list of unique tokens (str) with stop words removed.
    """
    # Null rows would otherwise raise AttributeError on .replace().
    if not raw_corpus:
        return []
    if stop_dict is None:
        stop_dict = get_stop_words()

    cleaned = raw_corpus
    for unwanted in ('"', '[', ']', ' '):
        cleaned = cleaned.replace(unwanted, '')

    tokens = [tok for tok in jieba.cut(cleaned) if tok not in stop_dict]
    # dict.fromkeys de-duplicates while keeping first-seen order; a plain
    # set() would return the tokens in arbitrary order.
    return list(dict.fromkeys(tokens))
# get_corpus returns a list of tokens, so the UDF has to declare an
# array-of-strings return type. Spark ML text transformers such as
# CountVectorizer require an array<string> input column.
udf_get_corpus = udf(get_corpus, ArrayType(StringType()))
# Replace the raw text column with its tokenized form.
df = df.withColumn('corpus', udf_get_corpus(df.raw_corpus)).drop('raw_corpus')
- 去除停用词:可以通过 udf 的方式加载 stop_words,或者使用 Spark 自带的 StopWordsRemover。
转换向量
### CountVectorizer
from pyspark.ml.feature import CountVectorizer, HashingTF, IDF
from pyspark.sql.types import IntegerType
from pyspark.ml.feature import CountVectorizer
count_vectorizer = CountVectorizer(inputCol='new_tokenized', outputCol='count_vector')
# Fit once and keep the fitted model. Re-fitting afterwards would repeat the
# full pass over the data, and new_df by then already contains the
# 'count_vector' output column, which Spark's schema validation rejects.
count_vectorizer_model = count_vectorizer.fit(new_df)
new_df = count_vectorizer_model.transform(new_df)
# Inspect the vocabulary learned by the fitted model:
count_vectorizer_model.vocabulary
### Tf-idf
```python
# Term-frequency vectors from the token column.
hashing_vector = HashingTF(
    inputCol='new_tokenized',
    outputCol='tf_vector',
)
hashing_df = hashing_vector.transform(new_df)

# IDF is an estimator: fit it on the term-frequency vectors, then apply the
# fitted model to the same frame to obtain the TF-IDF column.
tf_idf_vector = IDF(
    inputCol='tf_vector',
    outputCol='tf_idf_vector',
)
idf_model = tf_idf_vector.fit(hashing_df)
tf_idf_df = idf_model.transform(hashing_df)
```
### word2vec
from pyspark.ml.feature import Word2Vec

# NOTE(review): inputCol is "words", whereas the other snippets read the
# 'new_tokenized' column from new_df. Confirm that new_df has a "words" column.
w2v = Word2Vec(
    vectorSize=100,
    minCount=3,
    seed=123,
    numPartitions=64,
    inputCol="words",
    outputCol="result",
)
# Train the embedding model, then append the "result" column to the same frame.
model = w2v.fit(new_df)
corpus_goods = model.transform(new_df)
简单LR回归#
## Modeling
from pyspark.ml.classification import LogisticRegression
# The evaluator is used below but was never imported.
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# A fixed seed keeps the train/test split repeatable between runs.
train, test = tf_idf_df.randomSplit([0.8, 0.2], seed=123)
model = LogisticRegression(featuresCol='tf_idf_vector', labelCol='label').fit(train)
result = model.evaluate(test).predictions
# AUC must be computed from the continuous scores in 'rawPrediction'.
# Feeding the hard 0/1 'prediction' column collapses the ROC curve to a
# single operating point and misreports the metric.
auc = BinaryClassificationEvaluator(
    labelCol='label',
    rawPredictionCol='rawPrediction',
).evaluate(result)
python版本#
参考#
案例
https://blog.csdn.net/u014281392/article/details/89972877
https://zhuanlan.zhihu.com/p/60532089
pyspark:
https://spark.apache.org/docs/3.3.2/ml-features.html#tf-idf