# pylint: disable=C0321,C0103,E1221,C0301,E1305,E1121,C0302,C0330
# -*- coding: utf-8 -*-
"""
Methods for feature extraction and preprocessing
util_feature: input/output is pandas
#####################################################################################################
######### Term Frequency ##########################################################################
If you need the term frequency (term count) vectors for different tasks, use TfidfTransformer.
If you need to compute tf-idf scores on documents within your “training” dataset, use TfidfVectorizer.
If you need to compute tf-idf scores on documents outside your “training” dataset, use either one; both will work.
#####################################################################################################
### The sklearn.feature_extraction.text submodule gathers utilities to build feature vectors from text documents.
feature_extraction.text.CountVectorizer([...]) Convert a collection of text documents to a matrix of token counts.
feature_extraction.text.HashingVectorizer([...]) Convert a collection of text documents to a matrix of token occurrences.
feature_extraction.text.TfidfVectorizer([...]) Convert a collection of raw documents to a matrix of TF-IDF features.
"""
import copy
import json
import math
import os
import re
import string
from collections import Counter, OrderedDict
import numpy as np
import pandas as pd
import scipy as sci
import nltk
import sklearn as sk
########### Local Import #####################################################################
from nltk.corpus import stopwords
# Stemming and Lemmatizing
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn import preprocessing
from sklearn.feature_extraction.text import (CountVectorizer, TfidfTransformer,
TfidfVectorizer)
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
# import spacy
# import gensim
from dsa.da.model.column_encoder import MinHashEncoder
# NOTE(review): import-time side effect - the working directory is printed
# every time this module is imported.
print("os.getcwd", os.getcwd())
##############################################################################################
# ASCII punctuation characters (same characters as ``string.punctuation``).
punctuations = "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~"
#############################################################################################
#############################################################################################
def get_stopwords(lang):
    """Load the stopword list for a language from a JSON file.

    Only ``lang == "en"`` is handled: the words are read from the ``"word"``
    entry of ``stopwords_en.json``, resolved relative to the current working
    directory.

    :param lang: language code
    :return: the value stored under ``"word"`` in the JSON file, or ``None``
        for any other language
    """
    if lang != "en":
        return None
    # Context manager so the file handle is closed deterministically
    # (the previous ``json.load(open(...))`` leaked it).
    with open("stopwords_en.json", encoding="utf-8") as fp:
        return json.load(fp)["word"]
porter = PorterStemmer()


def coltext_stemporter(text):
    """Apply the Porter stemmer to each single-space-separated token of ``text``.

    Typical use is column-wise, e.g. ``df[col].apply(coltext_stemporter)``.

    :param text: input string
    :return: the stemmed tokens joined back together with single spaces
    """
    return " ".join(porter.stem(word) for word in text.split(" "))
wordnet = WordNetLemmatizer()


def coltext_lemmatizer(text):
    """Lemmatize each whitespace-separated token of ``text`` with WordNet.

    Typical use is column-wise, e.g. ``df[col].apply(coltext_lemmatizer)``.

    :param text: input string
    :return: the lemmatized tokens joined with single spaces
    """
    # WordNetLemmatizer exposes ``lemmatize`` - it has no ``stem`` method, so
    # the previous ``wordnet.stem(token)`` raised AttributeError on every call.
    return " ".join(wordnet.lemmatize(token) for token in text.split())
snowball = SnowballStemmer("english")


def coltext_stemmer(text, sep=" "):
    """Apply the English Snowball stemmer to each ``sep``-separated token.

    :param text: input string
    :param sep: separator used to split ``text`` into tokens
    :return: the stemmed tokens joined with single spaces (always a space,
        whatever ``sep`` was)
    """
    pieces = text.split(sep)
    return " ".join(map(snowball.stem, pieces))
def coltext_stopwords(text, stopwords=None, sep=" "):
    """Remove stopwords from a ``sep``-separated string.

    Each token is stripped of surrounding whitespace before being compared
    with ``stopwords``; the surviving (stripped) tokens are joined with single
    spaces.

    :param text: input string
    :param stopwords: container of words to drop; ``None`` (the default)
        drops nothing
    :param sep: separator used to split ``text`` into tokens
    :return: the filtered tokens joined with single spaces
    """
    # The default ``None`` used to crash with "argument of type 'NoneType' is
    # not iterable"; treat it as an empty stopword collection instead.
    if stopwords is None:
        stopwords = ()
    stripped = (tok.strip() for tok in text.split(sep))
    return " ".join(tok for tok in stripped if tok not in stopwords)
def pd_coltext_fillna(df, colname, val=""):
    """Return ``df[colname]`` with missing values replaced by ``val``.

    :param df: source dataframe (not modified)
    :param colname: column selector passed to ``df[...]``
    :param val: replacement for missing values
    :return: the result of ``fillna`` on the selected column(s)
    """
    selected = df[colname]
    return selected.fillna(val)
def pd_coltext_clean(dfref, colname, stopwords):
    """Basic text cleaning of several text columns.

    For every column in ``colname``: missing values become ``""``, text is
    lower-cased, ASCII punctuation is replaced by spaces, ASCII digits are
    removed, and stopwords are dropped with ``coltext_stopwords``.

    :param dfref: source dataframe (not modified)
    :param colname: list of column names to clean
    :param stopwords: container of words to remove
    :return: new dataframe holding only the cleaned ``colname`` columns
    :raises Exception: if ``colname`` is a single string instead of a list
    """
    if isinstance(colname, str):
        raise Exception("colname should be list of colname")

    # Explicit copy: assigning into a bare column selection triggers
    # SettingWithCopyWarning in pandas.
    df = dfref[colname].copy()

    # ``str.translate`` needs a mapping built by ``str.maketrans``. The old
    # calls passed ``string.punctuation`` / ``string.digits`` directly, which
    # stripped nothing and instead remapped control characters (e.g. a tab,
    # ordinal 9, became ``string.punctuation[9]``).
    # Punctuation -> space (as the former regex step did for a subset of
    # these characters); digits are deleted.
    table = str.maketrans(
        string.punctuation, " " * len(string.punctuation), string.digits
    )

    for col in colname:
        cleaned = df[col].fillna("").str.lower()
        cleaned = cleaned.apply(lambda x: x.translate(table))
        df[col] = cleaned.apply(lambda x: coltext_stopwords(x, stopwords=stopwords))
    return df
def pd_coltext_clean_advanced(dfref, colname, fromword, toword):
    """Lower-case text columns and apply regex replacements.

    For every column in ``colname``: missing values become ``""``, text is
    lower-cased, then ``Series.replace(fromword, toword, regex=True)`` is
    applied.

    :param dfref: source dataframe (not modified)
    :param colname: list of column names to clean
    :param fromword: regex pattern, or iterable of patterns, to replace
    :param toword: replacement string, or list of replacements paired by
        position with ``fromword``
    :return: new dataframe holding only the cleaned ``colname`` columns
    :raises Exception: if ``colname`` is a single string instead of a list
    """
    if isinstance(colname, str):
        raise Exception("colname should be list of colname")

    # Explicit copy: avoids chained assignment on a column selection.
    df = dfref[colname].copy()

    # Patterns must keep their order so that fromword[i] maps to toword[i];
    # the previous ``set(fromword)`` scrambled that pairing (and split a
    # single string pattern into individual characters).
    if not isinstance(fromword, str):
        fromword = list(fromword)

    for col in colname:
        lowered = df[col].fillna("").str.lower()
        df[col] = lowered.replace(fromword, toword, regex=True)
    return df
def pd_coltext_wordfreq(df, coltext, sep=" "):
    """Count how often each token occurs in a text column.

    :param df: source dataframe
    :param coltext: name of the text column to analyse
    :param sep: separator used to split each text into tokens
    :return: dataframe with columns ``word`` and ``freq``, ordered by
        decreasing ``freq``; empty (but with both columns) when the column
        holds no text
    """
    counts = Counter()
    # Accumulating into a Counter avoids building the intermediate
    # (n_rows x vocabulary) frame that the per-row ``pd.value_counts`` +
    # ``sum`` approach produced, and also works for an empty column.
    # Missing values are skipped rather than crashing on ``.split``.
    for text in df[coltext].dropna():
        counts.update(text.split(sep))
    # most_common() already yields (word, count) pairs by decreasing count.
    return pd.DataFrame(counts.most_common(), columns=["word", "freq"])
def pd_fromdict(ddict, colname):
    """Turn a dict into a two-column dataframe sorted by value, descending.

    :param ddict: mapping whose keys fill the first column and whose values
        fill the second
    :param colname: pair of column names ``(key_column, value_column)``;
        ``None`` selects ``("c0", "c1")``
    :return: dataframe sorted by the value column in descending order
    """
    if colname is None:
        colname = ("c0", "c1")
    key_col, val_col = colname[0], colname[1]
    frame = pd.DataFrame(
        {key_col: list(ddict.keys()), val_col: list(ddict.values())}
    )
    return frame.sort_values(by=val_col, ascending=False)
def pd_coltext_encoder(df):
    """Placeholder for a text-column encoder; not implemented, returns ``None``.

    Reference example:
    https://dirty-cat.github.io/stable/auto_examples/02_fit_predict_plot_employee_salaries.html#sphx-glr-auto-examples-02-fit-predict-plot-employee-salaries-py

    :param df: input dataframe (currently unused)
    :return: ``None``
    """
    return None
def pd_coltext_countvect(
    df, coltext, word_tokeep=None, word_minfreq=1, return_val="dataframe,param"
):
    """Build a token-count matrix (1- to 3-grams) for one text column.

    https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html

    :param df: source dataframe
    :param coltext: name of the text column (must be a string)
    :param word_tokeep: optional iterable of words used to fit the vocabulary;
        when ``None`` the vocabulary is learned from ``df[coltext]``
    :param word_minfreq: ``min_df`` passed to ``CountVectorizer``
    :param return_val: ``"dataframe,param"`` returns ``(df_vector, word_dict)``;
        any other value returns only ``df_vector``
    :return: ``df_vector`` has one column per vocabulary term (named after the
        term); ``word_dict`` maps each term to its total count over the column
    :raises Exception: if ``coltext`` is not a string
    """
    if not isinstance(coltext, str):
        raise Exception("coltext should be column string")

    vect = CountVectorizer(
        min_df=word_minfreq,
        ngram_range=(1, 3),
        strip_accents="unicode",
        lowercase=True,
        analyzer="word",
        token_pattern=r"\w+",
        stop_words=None,
    )
    if word_tokeep is None:
        v = vect.fit_transform(df[coltext])
    else:
        vect.fit(word_tokeep)
        v = vect.transform(df[coltext])
    v = v.toarray()

    # ``get_feature_names`` was removed from recent scikit-learn releases in
    # favour of ``get_feature_names_out``; support both.
    if hasattr(vect, "get_feature_names_out"):
        voca = list(vect.get_feature_names_out())
    else:
        voca = list(vect.get_feature_names())

    # Feature names are ordered by column index, so they pair correctly with
    # the per-column totals. The previous ``zip(word_tokeep, ...)`` failed when
    # ``word_tokeep`` was None and mis-paired words and counts otherwise.
    count_list = np.asarray(v.sum(axis=0))
    word_dict = dict(zip(voca, count_list))

    # ``vocabulary_`` is a term -> column-index dict whose iteration order is
    # not the column order, so it must not be used to label the columns.
    df_vector = pd.DataFrame(v, columns=voca)

    if return_val == "dataframe,param":
        return df_vector, word_dict
    return df_vector
def pd_coltext_tdidf(df, coltext, word_tokeep=None, word_minfreq=1, return_val="dataframe,param"):
    """Compute TF-IDF features for one text column.

    https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html

    :param df: source dataframe
    :param coltext: name of the text column (must be a string)
    :param word_tokeep: optional iterable of words used to fit the TF-IDF
        vocabulary; when ``None`` it is derived from ``df[coltext]`` with a
        1- to 3-gram ``CountVectorizer``
    :param word_minfreq: ``min_df`` used when the vocabulary is derived from
        ``df[coltext]`` (ignored when ``word_tokeep`` is given)
    :param return_val: ``"dataframe,param"`` returns ``(df_vector, voca)``;
        any other value returns only ``df_vector``
    :return: ``df_vector`` holds the TF-IDF matrix with integer column labels;
        ``voca`` is the fitted ``vocabulary_`` (term -> column index)
    :raises Exception: if ``coltext`` is not a string
    """
    if not isinstance(coltext, str):
        raise Exception("coltext should be string")

    if word_tokeep is None:
        cv = CountVectorizer(
            # Honour the caller's threshold (it was hard-coded to 1 before,
            # silently ignoring ``word_minfreq``).
            min_df=word_minfreq,
            ngram_range=(1, 3),
            strip_accents="unicode",
            lowercase=True,
            analyzer="word",
            token_pattern=r"\w+",
            stop_words=None,
        )
        cv.fit(df[coltext])
        # ``get_feature_names`` was removed from recent scikit-learn releases
        # in favour of ``get_feature_names_out``; support both.
        if hasattr(cv, "get_feature_names_out"):
            word_tokeep = list(cv.get_feature_names_out())
        else:
            word_tokeep = list(cv.get_feature_names())

    # Fit the TF-IDF vocabulary/idf on the kept words, then score the column.
    vectorizer = TfidfVectorizer()
    vectorizer.fit(word_tokeep)
    v = vectorizer.transform(df[coltext]).toarray()

    voca = vectorizer.vocabulary_
    df_vector = pd.DataFrame(v)

    if return_val == "dataframe,param":
        return df_vector, voca
    return df_vector
def pd_coltext_minhash(
    dfref, colname, n_component=2, model_pretrain_dict=None, return_val="dataframe,param"
):
    """Encode columns with ``MinHashEncoder``, one encoder per column.

    Example::

        dfhash, hash_param = pd_coltext_minhash(df, cols, n_component=[2] * len(cols),
                                                return_val="dataframe,param")

    :param dfref: source dataframe
    :param colname: list of column names to encode
    :param n_component: number of hash components; either one scalar applied
        to every column or a sequence aligned with ``colname``
    :param model_pretrain_dict: optional ``{column: fitted encoder}``; columns
        found here reuse a deep copy of that encoder instead of fitting
    :param return_val: ``"dataframe,param"`` returns ``(dfall, enc_dict)``;
        any other value returns only ``dfall``
    :return: ``dfall`` with columns ``{col}_hash_{t}``; ``enc_dict`` maps each
        column to a deep copy of the encoder used for it
    """
    df = dfref[colname]
    model_pretrain_dict = {} if model_pretrain_dict is None else model_pretrain_dict
    enc_dict = {}
    frames = []

    for i, col in enumerate(colname):
        pretrained = model_pretrain_dict.get(col)
        if pretrained is None:
            # The default ``n_component=2`` is a scalar; indexing it used to
            # raise TypeError. Accept a scalar or a per-column sequence.
            n_comp = n_component if np.isscalar(n_component) else n_component[i]
            clf = MinHashEncoder(n_comp)
            clf = clf.fit(df[col])
        else:
            clf = copy.deepcopy(pretrained)

        v = clf.transform(df[col].values)
        enc_dict[col] = copy.deepcopy(clf)
        frames.append(
            pd.DataFrame(
                v, columns=["{col}_hash_{t}".format(col=col, t=t) for t in range(v.shape[1])]
            )
        )

    # Collect then concatenate once. The previous bare ``except`` around
    # ``pd.concat`` (used to detect the first iteration) also hid real concat
    # errors, and an empty ``colname`` ended in NameError.
    dfall = pd.concat(frames, axis=1) if frames else pd.DataFrame()

    if return_val == "dataframe,param":
        return dfall, enc_dict
    return dfall
def pd_coltext_hashing(df, coltext, n_features=20):
    """Hash a text column into a fixed number of features.

    The column is transformed with scikit-learn's ``HashingVectorizer`` and
    the shape of the resulting matrix is printed.

    :param df: source dataframe
    :param coltext: name of the text column to hash
    :param n_features: number of output features, passed to
        ``HashingVectorizer``
    :return: dataframe with columns ``c0`` ... ``c{n_features - 1}``
    """
    from sklearn.feature_extraction.text import HashingVectorizer

    hasher = HashingVectorizer(n_features=n_features)
    hashed = hasher.transform(df[coltext])
    print(hashed.shape)

    labels = ["c{}".format(idx) for idx in range(n_features)]
    return pd.DataFrame(hashed.toarray(), columns=labels)
def pd_coltext_tdidf_multi(
    df,
    coltext,
    coltext_freq,
    ntoken=100,
    word_tokeep_dict=None,
    stopwords=None,
    return_val="dataframe,param",
):
    """Run ``pd_coltext_tdidf`` on several text columns.

    :param df: source dataframe
    :param coltext: iterable of text column names
    :param coltext_freq: ``{column: frame}`` where each frame has a ``"word"``
        column; its first ``ntoken`` entries form the kept words. Only read
        when ``word_tokeep_dict`` is ``None``
    :param ntoken: number of words taken from ``coltext_freq`` per column
    :param word_tokeep_dict: optional ``{column: words}`` that overrides the
        selection from ``coltext_freq``
    :param stopwords: words excluded from the ``coltext_freq`` selection;
        ``None`` (the default) excludes nothing
    :param return_val: ``"dataframe,param"`` returns both dicts; any other
        value returns only the dict of TF-IDF frames
    :return: ``{column: tf-idf dataframe}`` and, optionally,
        ``{column: vocabulary returned by pd_coltext_tdidf}``
    """
    # The default ``None`` used to crash on ``t not in stopwords``.
    stopwords = () if stopwords is None else stopwords

    dftext_tdidf = {}
    word_tokeep_dict_new = {}
    for col in coltext:
        if word_tokeep_dict is None:
            candidates = coltext_freq[col]["word"].values[:ntoken]
            word_tokeep = [t for t in candidates if t not in stopwords]
        else:
            word_tokeep = word_tokeep_dict[col]

        dftext_tdidf[col], word_tokeep_dict_new[col] = pd_coltext_tdidf(
            df, col, word_tokeep=word_tokeep, word_minfreq=1, return_val="dataframe,param"
        )

    if return_val == "dataframe,param":
        return dftext_tdidf, word_tokeep_dict_new
    return dftext_tdidf