import subprocess
import textacy
from textacy.representations.vectorizers import Vectorizer
import textacy.tm
from textacy import preprocessing
from . import Content
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules
class Qrmine(object):
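    """NLP helper for qualitative research, built on textacy/spaCy.

    Wraps a textacy corpus and an LDA topic model to extract categories,
    topics, and a coding dictionary from interview documents. See the
    usage sketch after ``process_content`` below.
    """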
def __init__(self):
self._min_occurrence_for_topic = 2
self._common_verbs = 10
# create an empty corpus
self._en = textacy.load_spacy_lang('en_core_web_sm')
self._corpus = textacy.Corpus(lang=self._en)
self._content = None
self._model = None
self._numdocs = 0
self._numtopics = 0
self._terms = None
self._doc_term_matrix = None
self._doc_topic_matrix = None
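        # tf-idf vectorizer: linear term frequency, smoothed idf, L2-normalised
        # rows; a term must appear in at least 2 documents and in no more than
        # 95% of them.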
self._vectorizer = Vectorizer(tf_type='linear', idf_type='smooth',
norm='l2', min_df=2, max_df=0.95, max_n_terms=100000)
@property
def content(self):
return self._content
@content.setter
def content(self, content):
self._content = content
    def min_topic(self, min_topic):
        """Set the minimum number of occurrences for a topic."""
        self._min_occurrence_for_topic = min_topic
    def common_verbs(self, common_verbs):
        """Set the number of common verbs to consider."""
        self._common_verbs = common_verbs
    @staticmethod
    def print_table(table):
        """Pretty-print rows of strings as a pipe-delimited, column-aligned table."""
        col_width = [max(len(x) for x in col) for col in zip(*table)]
        for line in table:
            print("| " + " | ".join("{:{}}".format(x, col_width[i])
                                    for i, x in enumerate(line)) + " |")
    @property
    def get_git_revision_hash(self):
        return subprocess.check_output(['git', 'rev-parse', 'HEAD']).strip().decode("utf-8")
@property
def get_git_revision_short_hash(self):
return subprocess.check_output(['git', 'rev-parse', '--short', 'HEAD']).strip().decode("utf-8")
    def print_categories(self, doc, num=10):
        """Print and return the top ``num`` weighted terms (categories) in ``doc``."""
        textacy.spacier.extensions.set_doc_extensions("extract.bags")
        bot = doc._.to_bag_of_terms(by='lemma_', weighting='freq',
                                    ngs=(1, 2, 3), ents=True, ncs=True, dedupe=True)
categories = sorted(bot.items(), key=lambda x: x[1], reverse=True)[:num]
output = []
to_return = []
print("\n---Categories with count---")
output.append(("CATEGORY", "WEIGHT"))
for category, count in categories:
output.append((category, str(count)))
to_return.append(category)
self.print_table(output)
print("---------------------------\n")
return to_return
def category_basket(self, num=10):
"""Generates a basket of categories for association
Args:
num (int, optional): number of categories to generate for each doc in corpus. Defaults to 10.
Returns:
list: The list of lists (each list is categories in each document)
"""
item_basket = []
        for index, title in enumerate(self._content.titles):  # QRMine's content must be set first
content = self._content.documents[index]
this_record = Content(content)
doc = textacy.make_spacy_doc(this_record.doc, lang=self._en)
item_basket.append(self.print_categories(doc, num))
return item_basket
# Example return:
# [['GT', 'Strauss', 'coding', 'ground', 'theory', 'seminal', 'Corbin', 'code',
# 'structure', 'ground theory'], ['category', 'theory', 'comparison', 'incident',
# 'GT', 'structure', 'coding', 'Classical', 'Grounded', 'Theory'],
# ['theory', 'GT', 'evaluation'], ['open', 'coding', 'category', 'QRMine',
# 'open coding', 'researcher', 'step', 'data', 'break', 'analytically'],
# ['ground', 'theory', 'GT', 'ground theory'], ['category', 'comparison', 'incident',
# 'category comparison', 'Theory', 'theory']]
def category_association(self, num=10):
"""Generates the support for itemsets
Args:
num (int, optional): number of categories to generate for each doc in corpus. . Defaults to 10.
"""
basket = self.category_basket(num)
te = TransactionEncoder()
te_ary = te.fit(basket).transform(basket)
df = pd.DataFrame(te_ary, columns=te.columns_)
return apriori(df, min_support=0.6, use_colnames=True)
# Example
# support itemsets
# 0 0.666667 (GT)
# 1 0.833333 (theory)
# 2 0.666667 (theory, GT)
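    # The ``association_rules`` import above can be chained onto these frequent
    # itemsets; a minimal sketch (``q`` is a Qrmine instance, the threshold
    # value is illustrative):
    #
    #   freq = q.category_association()
    #   rules = association_rules(freq, metric="confidence", min_threshold=0.7)
    #   print(rules[["antecedents", "consequents", "support", "confidence"]])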
    def unique(self, list1):
        """Return the unique items of ``list1`` (order not preserved)."""
        return list(set(list1))
"""
test: test_generate_topics in test_nlp
"""
    def print_topics(self, numtopics=0):
        """Print the unique topic descriptions found by the model."""
        if numtopics > 0:
            self._numtopics = numtopics
        topic_list = list(range(self._numtopics))  # topic indices are 0-based
output = []
topics = [] # Topics are added here first to find unique topics
print("\n---Topics---")
output.append(("TOPIC", "DESCRIPTION"))
        for _, top_terms in self._model.top_topic_terms(self._vectorizer.id_to_term, topics=topic_list):
            topics.append(' '.join(top_terms))
unique_topics = self.unique(topics)
ct = 0
# Finally added to output
for topic in unique_topics:
ct += 1
output.append(("TOPIC:" + str(ct), topic))
self.print_table(output)
print("---------------------------\n")
    # Reference: https://www.pydoc.io/pypi/textacy-0.4.0/autoapi/tm/topic_model/index.html
    # On removing stop words: http://ai.intelligentonlinetools.com/ml/category/topic-modeling/
    def print_documents(self, top_n=2):
        """Print the top ``top_n`` documents per topic, then each document's topics."""
        topic_list = list(range(self._numtopics))  # topic indices are 0-based
output = []
print("\n---Topics---")
output.append(("TOPIC", "DOCUMENTS"))
for topic_idx, top_docs in self._model.top_topic_docs(self._doc_topic_matrix, topics=topic_list,
top_n=top_n):
str_topic_idx = str(topic_idx)
for j in top_docs:
output.append((str_topic_idx, self._corpus.docs[j]._.meta["title"]))
str_topic_idx = "..."
self.print_table(output)
print("---------------------------\n")
print("\n---Documents To Topics---")
for doc_idx, topics in self._model.top_doc_topics(self._doc_topic_matrix, docs=range(self._numdocs),
top_n=top_n):
print(self._corpus.docs[doc_idx]._.meta["title"], ':', topics)
print("---------------------------\n")
    def print_dict(self, content, num=10):
        """Print a coding dictionary: categories with their properties and dimensions."""
        output = []
        print("\n---Coding Dictionary---")
        output.append(("CATEGORY", "PROPERTY", "DIMENSION"))
        words = content.common_verbs(num)
        for word, _ in words:
            for attribute, _ in content.attributes(word, 3):
                for dimension, _ in content.dimensions(attribute, 3):
                    output.append((word, attribute, dimension))
                    word = '...'
                    attribute = '...'
self.print_table(output)
print("---------------------------\n")
    def process_content(self):
        """Build the textacy corpus from QRMine's content, then fit the topic model."""
        if self._content is not None:
            for ct, document in enumerate(self._content.documents):
                metadata = {}
                try:
                    metadata['title'] = self._content.titles[ct]
                except IndexError:
                    metadata['title'] = 'Empty'
                # textacy 0.10+ API (2-Jan-2020 breaking change): strip punctuation,
                # mask numbers, and lower-case before building the spaCy doc.
                doc_text = preprocessing.replace.numbers(preprocessing.remove.punctuation(document)).lower()
                doc = textacy.make_spacy_doc((doc_text, metadata), lang=self._en)
                self._corpus.add_doc(doc)
        self.load_matrix()
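    # A minimal end-to-end sketch (assumes ``content`` is a loaded Content
    # object exposing ``documents`` and ``titles``):
    #
    #   q = Qrmine()
    #   q.content = content
    #   q.process_content()   # builds the corpus and fits the topic model
    #   q.print_topics()
    #   q.print_documents()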
    def filter_content(self, titles):
        """Rebuild the corpus from only those documents whose title matches one in ``titles``."""
        if self._content is not None:
            for ct, document in enumerate(self._content.documents):
                metadata = {}
                try:
                    if any(self._content.titles[ct] in s for s in titles):
                        metadata['title'] = self._content.titles[ct]
                        # same textacy 0.10+ preprocessing as in process_content
                        doc_text = preprocessing.replace.numbers(preprocessing.remove.punctuation(document)).lower()
                        doc = textacy.make_spacy_doc((doc_text, metadata), lang=self._en)
                        self._corpus.add_doc(doc)
                except IndexError:
                    metadata['title'] = 'Empty'
        self.load_matrix()
    def load_matrix(self):
        """Vectorize the corpus and fit an LDA topic model to the doc-term matrix."""
        textacy.spacier.extensions.set_doc_extensions("extract.keyterms")
        terms = ((term.text for term in textacy.extract.terms(doc, ngs=1, ents=True)) for doc in self._corpus.docs)
self._doc_term_matrix = self._vectorizer.fit_transform(terms)
self._numdocs, self._terms = self._doc_term_matrix.shape
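        # One topic per document is assumed here (n_topics = number of docs);
        # pass a different n_topics to TopicModel for a coarser grouping.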
self._model = textacy.tm.TopicModel('lda', n_topics=self._numdocs)
self._model.fit(self._doc_term_matrix)
try:
self._doc_topic_matrix = self._model.transform(self._doc_term_matrix)
_, self._numtopics = self._doc_topic_matrix.shape
except ValueError:
print("No topics found")