Source code for qrmine.content

"""
 Copyright (C) 2020 Bell Eapen

 This file is part of qrmine.

 qrmine is free software: you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation, either version 3 of the License, or
 (at your option) any later version.

 qrmine is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.

 You should have received a copy of the GNU General Public License
 along with qrmine.  If not, see <http://www.gnu.org/licenses/>.
"""

import operator

# import en_core_web_sm

import textacy

class Content(object):
    """Wraps a text document and its spaCy/textacy analysis.

    On construction the document is run through the ``en_core_web_sm``
    pipeline and :meth:`process` indexes the informative tokens into
    per-token lookup dicts (POS, lemma, sentiment, tag, dependency,
    log-probability and character offset).  Convenience methods then
    report frequent words/nouns/verbs, the sentences/spans that contain
    them, and co-occurring modifiers ("dimensions") and nouns
    ("attributes") for grounded-theory style coding.
    """

    def __init__(self, content):
        """Analyse *content* (a plain-text string) eagerly.

        Args:
            content (str): The raw document text.
        """
        self._content = content
        self._nlp = textacy.load_spacy_lang("en_core_web_sm")
        self._processed = self._nlp(self._content)
        # Per-token lookup tables, keyed by the spaCy Token object.
        self._lemma = {}
        self._pos = {}
        self._pos_ = {}
        self._word = {}
        self._sentiment = {}
        self._tag = {}
        self._dep = {}
        self._prob = {}
        self._idx = {}
        self.process()

    @property
    def content(self):
        """str: The raw document text."""
        return self._content

    @content.setter
    def content(self, content):
        # NOTE(review): the setter only stores the string; it does NOT
        # re-run the NLP pipeline, so the token tables go stale — confirm
        # whether callers rely on that.
        self._content = content

    # --- per-token accessors -------------------------------------------
    # BUG FIX: these were decorated with @property while also taking a
    # ``token`` argument.  A property getter receives only ``self``, so
    # every attribute access raised TypeError; they are plain methods now.

    def lemma(self, token):
        """Return the stored lemma for *token*, or '' if absent.

        ``_lemma`` is never populated by :meth:`process` (the assignment
        there is commented out), so this currently always returns ''.
        """
        return self._lemma.get(token, '')

    def pos(self, token):
        """Return the coarse POS *label* (spaCy ``token.pos_``) or ''."""
        return self._pos.get(token, '')

    def pos_(self, token):
        """Return the integer POS id (spaCy ``token.pos``) or 0.

        Note the naming is swapped relative to spaCy's own convention
        (kept for backward compatibility with existing callers).
        """
        return self._pos_.get(token, 0)

    def word(self, token):
        """Return the lemma recorded for *token* or ''."""
        return self._word.get(token, '')

    def sentiment(self, token):
        """Return the sentiment score recorded for *token* or 0."""
        return self._sentiment.get(token, 0)

    def tag(self, token):
        """Return the fine-grained tag (``token.tag_``) or ''."""
        return self._tag.get(token, '')

    def dep(self, token):
        """Return the dependency label (``token.dep_``) or 0."""
        return self._dep.get(token, 0)

    def prob(self, token):
        """Return the smoothed log probability (``token.prob``) or 0."""
        return self._prob.get(token, 0)

    def idx(self, token):
        """Return the character offset of *token* in the text or 0."""
        return self._idx.get(token, 0)

    @property
    def doc(self):
        """spacy.tokens.Doc: The processed document."""
        return self._processed

    def process(self):
        """Index every informative token of the document.

        Skips stop words, digits, punctuation, whitespace, URL/number/
        email-like tokens, tokens shorter than 3 characters and all-caps
        acronyms; everything else is recorded in the per-token tables.

        BUG FIX: the original assigned ``token.sentiment``, ``token.tag_``,
        ``token.dep_``, ``token.prob`` and ``token.idx`` directly to the
        dict attributes (``self._sentiment = ...``), clobbering each table
        with the last token's scalar.  They are stored per-token now.
        """
        for token in self._processed:
            if token.is_stop or token.is_digit or token.is_punct or token.is_space:
                continue
            if token.like_url or token.like_num or token.like_email:
                continue
            if len(token.text) < 3 or token.text.isupper():
                continue
            # self._lemma[token] = token.lemma_
            self._pos[token] = token.pos_
            self._pos_[token] = token.pos
            self._word[token] = token.lemma_
            self._sentiment[token] = token.sentiment
            self._tag[token] = token.tag_
            self._dep[token] = token.dep_
            self._prob[token] = token.prob
            self._idx[token] = token.idx

    def _common_by_pos(self, pos, index):
        """Return the *index* most frequent lemmas whose POS equals *pos*.

        Shared implementation for :meth:`common_nouns` and
        :meth:`common_verbs` (they were copy-paste duplicates).
        """
        _words = {}
        for token, lemma in self._word.items():
            if self._pos.get(token, None) == pos:
                _words[lemma] = _words.get(lemma, 0) + 1
        return sorted(_words.items(), key=operator.itemgetter(1), reverse=True)[:index]

    def common_words(self, index=10):
        """Return the *index* most frequent lemmas as (lemma, count) pairs."""
        _words = {}
        # Only the lemma values are needed, so iterate .values() directly.
        for lemma in self._word.values():
            _words[lemma] = _words.get(lemma, 0) + 1
        return sorted(_words.items(), key=operator.itemgetter(1), reverse=True)[:index]

    def common_nouns(self, index=10):
        """Return the *index* most frequent noun lemmas as (lemma, count)."""
        return self._common_by_pos('NOUN', index)

    def common_verbs(self, index=10):
        """Return the *index* most frequent verb lemmas as (lemma, count)."""
        return self._common_by_pos('VERB', index)

    def sentences_with_common_nouns(self, index=10):
        """Return sentences containing at least one of the top *index* nouns.

        BUG FIXES: ``Token.string`` was removed in spaCy 3 — the sentence
        text now comes from ``Span.text`` — and a sentence matching several
        common nouns was previously appended once per noun; it is appended
        at most once now.
        """
        _nouns = self.common_nouns(index)
        sents = []
        # Doc.sents yields sentence Spans over the original token sequence.
        for span in self._processed.sents:
            sent = span.text.strip()
            for noun, _freq in _nouns:
                if noun in sent:
                    sents.append(sent)
                    break  # one entry per sentence, however many nouns match
        return sents

    def spans_with_common_nouns(self, word):
        """Return the sentence spans where *word* occurs in a token's lemma.

        Matching is by substring against the recorded lemma.  BUG FIX: the
        original appended the same span once per matching token, which
        silently inflated the co-occurrence counts in :meth:`dimensions`
        and :meth:`attributes`; each span is appended at most once now.
        """
        spans = []
        for span in self._processed.sents:
            for token in span:
                if word in self._word.get(token, ' '):
                    spans.append(span)
                    break  # count each sentence once
        return spans

    def dimensions(self, word, index=3):
        """Return the top *index* modifiers/verbs co-occurring with *word*.

        Counts ADJ, ADV and VERB lemmas in every sentence that mentions
        *word* and returns them as (lemma, count) pairs, most frequent first.
        """
        _ad = {}
        for span in self.spans_with_common_nouns(word):
            for token in span:
                # The three POS classes were three identical if-blocks.
                if self._pos.get(token, None) in ('ADJ', 'ADV', 'VERB'):
                    lemma = self._word.get(token)
                    _ad[lemma] = _ad.get(lemma, 0) + 1
        return sorted(_ad.items(), key=operator.itemgetter(1), reverse=True)[:index]

    def attributes(self, word, index=3):
        """Return the top *index* other nouns co-occurring with *word*.

        Counts NOUN lemmas (excluding *word* itself) in every sentence
        that mentions *word*, as (lemma, count) pairs.
        """
        _ad = {}
        for span in self.spans_with_common_nouns(word):
            for token in span:
                if (self._pos.get(token, None) == 'NOUN'
                        and word not in self._word.get(token, '')):
                    lemma = self._word.get(token)
                    _ad[lemma] = _ad.get(lemma, 0) + 1
        return sorted(_ad.items(), key=operator.itemgetter(1), reverse=True)[:index]

    def generate_summary(self, weight=10):
        """Generate an extractive summary of the document.

        Args:
            weight (int, optional): How many of the most common words to
                expand into sentences. Defaults to 10.

        Returns:
            list: Unique sentence texts containing the top common words,
            in first-occurrence order.
        """
        spans = []
        # enumerate replaces the manual counter; break (not continue) once
        # past the weight cutoff — every later count is also over it.
        for ct, (key, _value) in enumerate(self.common_words(), start=1):
            if ct > weight:
                break
            for span in self.spans_with_common_nouns(key):
                spans.append(span.text)
        return list(dict.fromkeys(spans))  # remove duplicates, keep order