Source code for qrmine.readfiles
import re
import requests
from pypdf import PdfReader
[docs]
class ReadData(object):
    """Collect text documents, their titles and the combined text.

    Three pieces of state are kept in step with each other:

    * ``content``   -- one string holding the text that was read,
    * ``documents`` -- a list with one string per document,
    * ``titles``    -- a list with one title per document.

    ``read_file`` fills them from a ``.txt`` file, a folder of ``.txt`` /
    ``.pdf`` files, or an HTTP(S) URL.
    """

    # Seconds to wait for the remote server before giving up on a URL.
    _REQUEST_TIMEOUT = 30

    # Marker that carries a document title inside a single ``.txt`` file.
    _BREAK_SPLIT = "<break>.*?</break>"
    _BREAK_TITLE = r"<break>(.*?)</break>"

    def __init__(self):
        self._content = ""
        self._documents = []
        self._titles = []

    # Getter must be defined first
    @property
    def content(self):
        """The text accumulated so far, as one string."""
        return self._content

    @content.setter
    def content(self, content):
        self._content = content

    @property
    def documents(self):
        """List of document bodies, parallel to ``titles``."""
        return self._documents

    @documents.setter
    def documents(self, documents):
        self._documents = documents

    @property
    def titles(self):
        """List of document titles, parallel to ``documents``."""
        return self._titles

    @titles.setter
    def titles(self, titles):
        self._titles = titles

    def append(self, title, document):
        """Add one titled document and extend ``content`` with its text."""
        self._titles.append(title)
        self._documents.append(document)
        self._content += document

    def read_file(self, input, comma_separated_ignore_words=None):
        """Load documents from a text file, a folder or a URL.

        Args:
            input: One of
                * a URL starting with ``http://`` or ``https://``;
                * a path ending in ``.txt`` whose documents are each
                  followed by a ``<break>title</break>`` marker;
                * a folder (path ending in ``/`` or an existing directory)
                  whose ``.txt`` and ``.pdf`` files are each read as one
                  document titled with the file name.
            comma_separated_ignore_words: Optional comma separated words
                that are removed (whole word, case-insensitive) from the
                text before it is stored.

        Raises:
            ValueError: If ``input`` is none of the supported kinds.

        After loading, documents that share a title are concatenated into a
        single entry, keeping the position of the first occurrence.
        """
        import os  # local import: os is not imported at module level

        # URLs are tested first; otherwise "https://host/" would be taken
        # for a folder and "https://host/a.txt" for a local file.
        if input.startswith(("http://", "https://")):
            self._read_url(input, comma_separated_ignore_words)
        elif input.endswith(".txt"):
            self._read_text_file(input, comma_separated_ignore_words)
        elif input.endswith("/") or os.path.isdir(input):
            self._read_folder(input, comma_separated_ignore_words)
        else:
            raise ValueError("Input must be a file name, folder name or url.")

        self._merge_duplicate_titles()

    @staticmethod
    def _strip_ignore_words(text, comma_separated_ignore_words):
        """Return ``text`` with every listed word removed (case-insensitive)."""
        if not comma_separated_ignore_words:
            return text
        for raw_word in comma_separated_ignore_words.split(","):
            word = raw_word.strip()
            if not word:
                # Blank entries (e.g. from a trailing comma) remove nothing.
                continue
            # re.escape keeps words such as "c++" or "a.b" from being
            # interpreted as regular-expression syntax.
            text = re.sub(
                r"\b" + re.escape(word) + r"\b",
                "",
                text,
                flags=re.IGNORECASE,
            )
        return text

    def _read_text_file(self, path, comma_separated_ignore_words):
        """Replace the current state with the documents of one ``.txt`` file."""
        with open(path, "r") as handle:
            text = self._strip_ignore_words(
                handle.read(), comma_separated_ignore_words
            )

        # content is the text with every <...> tag dropped.
        self._content = re.sub("<[^<]+?>", "", text)

        # Split and title extraction use the same flags so that both always
        # find the same markers, even when a title spans several lines.
        documents = re.split(self._BREAK_SPLIT, text, flags=re.DOTALL)
        # Delete the last record: it is whatever follows the final marker.
        del documents[-1]
        self._documents = documents
        self._titles = re.findall(self._BREAK_TITLE, text, flags=re.DOTALL)

    def _read_folder(self, folder, comma_separated_ignore_words):
        """Append every ``.txt`` and ``.pdf`` file of ``folder`` as a document."""
        import os  # local import: os is not imported at module level

        # Sorted so the document order does not depend on the file system.
        for file_name in sorted(os.listdir(folder)):
            path = os.path.join(folder, file_name)
            if file_name.endswith(".txt"):
                with open(path, "r") as handle:
                    text = handle.read()
            elif file_name.endswith(".pdf"):
                with open(path, "rb") as handle:
                    reader = PdfReader(handle)
                    # Guard against a page yielding no text at all.
                    text = "".join(
                        page.extract_text() or "" for page in reader.pages
                    )
            else:
                continue

            text = self._strip_ignore_words(text, comma_separated_ignore_words)
            self._content += text
            self._documents.append(text)
            self._titles.append(file_name)

    def _read_url(self, url, comma_separated_ignore_words):
        """Fetch ``url`` and store the response body as one document."""
        # A timeout keeps an unresponsive server from blocking forever.
        response = requests.get(url, timeout=self._REQUEST_TIMEOUT)
        if response.status_code != 200:
            # NOTE(review): any other status is ignored and nothing is
            # stored; confirm whether callers would prefer an exception.
            return

        text = self._strip_ignore_words(
            response.text, comma_separated_ignore_words
        )
        # NOTE(review): content is overwritten here while documents and
        # titles are appended to; kept as in the original implementation.
        self._content = text
        self._documents.append(text)
        self._titles.append(url)

    def _merge_duplicate_titles(self):
        """Concatenate documents sharing a title, keeping first-seen order."""
        merged = {}
        for title, document in zip(self._titles, self._documents):
            if title in merged:
                merged[title] += document
            else:
                merged[title] = document

        # Slice assignment updates the existing list objects in place.
        self._titles[:] = merged.keys()
        self._documents[:] = merged.values()