Source code for qrmine.readfiles

import re


[docs] class ReadData(object): def __init__(self): self._content = "" self._documents = [] self._titles = [] # Getter must be defined first @property def content(self): return self._content @content.setter def content(self, content): self._content = content @property def documents(self): return self._documents @documents.setter def documents(self, documents): self._documents = documents @property def titles(self): return self._titles @titles.setter def titles(self, titles): self._titles = titles
[docs] def append(self, title, document): self._titles.append(title) self._documents.append(document) self._content += document
[docs] def read_file(self, file_names): if len(file_names) > 1: for file_name in file_names: with open(file_name, 'r') as f: read_from_file = f.read() self._content = re.sub('<[^<]+?>', '', read_from_file) self._documents = re.split('<break>.*?</break>', read_from_file) # Delete the last blank record del self._documents[-1] pattern = r"<break>(.*?)</break>" _title = re.findall(pattern, read_from_file, flags=re.DOTALL)[0] self._titles.append(_title) f.close() else: file_name = file_names[0] with open(file_name, 'r') as f: read_from_file = f.read() self._content = re.sub('<[^<]+?>', '', read_from_file) self._documents = re.split('<break>.*?</break>', read_from_file) # Delete the last blank record del self._documents[-1] pattern = r"<break>(.*?)</break>" self._titles = re.findall(pattern, read_from_file, flags=re.DOTALL) """ Combine duplicate topics using Dict Currently supported only for single file. """ doc_dict = {} ct3 = 0 for t in self._titles: doc = doc_dict.get(t) if doc: doc_dict[t] = doc + self._documents[ct3] else: doc_dict[t] = self._documents[ct3] ct3 += 1 self._titles.clear() self._documents.clear() for t in doc_dict.keys(): self._documents.append(doc_dict.get(t)) self._titles.append(t) f.close()