Source code for qrmine.readfiles

import re
import requests
from pypdf import PdfReader



[docs]
class ReadData(object):
    """Collect text documents, their titles, and their combined content.

    Three pieces of state are maintained and exposed as read/write
    properties:

    * ``content``   -- one string holding the text that was read.
    * ``documents`` -- list of document strings.
    * ``titles``    -- list of titles, parallel to ``documents``.
    """

    # Seconds to wait on a URL before giving up, so a stalled server
    # cannot block the caller forever.
    _REQUEST_TIMEOUT = 30

    def __init__(self):
        self._content = ""
        self._documents = []
        self._titles = []

    # Getter must be defined first
    @property
    def content(self):
        """Text accumulated from the inputs read so far."""
        return self._content

    @content.setter
    def content(self, content):
        self._content = content

    @property
    def documents(self):
        """List of document strings, parallel to ``titles``."""
        return self._documents

    @documents.setter
    def documents(self, documents):
        self._documents = documents

    @property
    def titles(self):
        """List of document titles, parallel to ``documents``."""
        return self._titles

    @titles.setter
    def titles(self, titles):
        self._titles = titles

    def append(self, title, document):
        """Add one titled document and extend ``content`` with its text."""
        self._titles.append(title)
        self._documents.append(document)
        self._content += document

    @staticmethod
    def _remove_ignore_words(text, comma_separated_ignore_words):
        """Return *text* with every ignore word removed (case-insensitive)."""
        if not comma_separated_ignore_words:
            return text
        for raw_word in comma_separated_ignore_words.split(","):
            word = raw_word.strip()
            if not word:
                # Stray commas ("a,,b" or "a,") produce empty entries.
                continue
            # Escape the word so characters such as "+" or "." are taken
            # literally.  The look-arounds behave like \b for ordinary words
            # but also work when the word starts or ends with punctuation.
            pattern = r"(?<!\w)" + re.escape(word) + r"(?!\w)"
            text = re.sub(pattern, "", text, flags=re.IGNORECASE)
        return text

    def _merge_duplicate_titles(self):
        """Concatenate documents that share a title, keeping first-seen order."""
        merged = {}
        for title, document in zip(self._titles, self._documents):
            if title in merged:
                merged[title] += document
            else:
                merged[title] = document
        # Mutate the existing lists in place so outside references to them
        # keep seeing the current data.
        self._titles.clear()
        self._documents.clear()
        self._titles.extend(merged)
        self._documents.extend(merged.values())

    def read_file(self, input, comma_separated_ignore_words=None):
        """Read documents from a URL, a ``.txt`` file, or a folder.

        Args:
            input: One of
                * a URL starting with ``http://`` or ``https://``; the
                  response body becomes one document titled with the URL.
                  Responses whose status is not 200 are skipped silently.
                * a path ending in ``.txt``; the file is expected to hold
                  documents each followed by ``<break>title</break>``.
                  ``content``, ``documents`` and ``titles`` are replaced.
                * a folder (trailing ``/`` or an existing directory); every
                  ``.txt`` and ``.pdf`` file in it is appended as a
                  document titled with its file name, in sorted order.
            comma_separated_ignore_words: Optional comma separated words
                that are removed from the text before it is stored.

        Raises:
            ValueError: If *input* is none of the supported kinds.

        After reading, documents sharing a title are merged into one.
        """
        import os

        # URLs are tested first: "https://host/a.txt" and "https://host/"
        # would otherwise be mistaken for a local file or folder.
        if input.startswith(("http://", "https://")):
            response = requests.get(input, timeout=self._REQUEST_TIMEOUT)
            if response.status_code == 200:
                text = self._remove_ignore_words(
                    response.text, comma_separated_ignore_words
                )
                self._content = text
                self._documents.append(text)
                self._titles.append(input)
        elif input.endswith(".txt"):
            with open(input, "r") as f:
                text = f.read()
            text = self._remove_ignore_words(text, comma_separated_ignore_words)
            self._content = re.sub("<[^<]+?>", "", text)
            # Split and findall must use the same flags, otherwise a title
            # spanning several lines yields more titles than documents.
            self._documents = re.split(
                "<break>.*?</break>", text, flags=re.DOTALL
            )
            # Delete the last blank record
            del self._documents[-1]
            self._titles = re.findall(
                r"<break>(.*?)</break>", text, flags=re.DOTALL
            )
        elif input.endswith("/") or os.path.isdir(input):
            # Sorted so the document order does not depend on the file system.
            for file_name in sorted(os.listdir(input)):
                path = os.path.join(input, file_name)
                if file_name.endswith(".txt"):
                    with open(path, "r") as f:
                        text = f.read()
                elif file_name.endswith(".pdf"):
                    with open(path, "rb") as f:
                        reader = PdfReader(f)
                        # Guard against pages that yield no text.
                        text = "".join(
                            page.extract_text() or "" for page in reader.pages
                        )
                else:
                    continue
                text = self._remove_ignore_words(
                    text, comma_separated_ignore_words
                )
                self._content += text
                self._documents.append(text)
                self._titles.append(file_name)
        else:
            raise ValueError("Input must be a file name, folder name or url.")

        self._merge_duplicate_titles()
Source code for qrmine.readfiles

qrmine

Navigation

Related Topics