Loading embedder.py +18 −6 Original line number Diff line number Diff line Loading @@ -29,7 +29,7 @@ class Embedder(Consts): else: raise Exception("You need to load the documents first! Use load_docs()") def load_docs(self, directory="aiani dedomena/*", chunking_type=None, colors=list[None]) -> None: def load_docs(self, directory="aiani dedomena/*", chunking_type=None, colors=None) -> None: """ Loads the pdfs in MyDoc parser and saves them in self._docs. Also, if specified chunks the documents in the desired method. Loading @@ -43,8 +43,9 @@ class Embedder(Consts): # Load documents doc_paths = glob.glob(directory) # Load document paths for i in range(len(doc_paths)): color = colors[i] if colors else None doc_path = doc_paths[i] self._docs.append(MyDoc(doc_path, chunking_type=chunking_type, color=colors[i])) # Save docs in a list self._docs.append(MyDoc(doc_path, chunking_type=chunking_type, color=color)) # Save docs in a list # Load Chunks if specified if chunking_type: Loading @@ -57,6 +58,16 @@ class Embedder(Consts): """ return self._docs def get_chunks(self) -> list: """ Gets the loaded chunks from the loaded documents :return: list[chunks] """ if self._chunks: return self._chunks else: raise Exception("You need to chunk the loaded documents first!!! Use chunk_docs() method") def vectorize(self): pass Loading @@ -80,7 +91,8 @@ class Embedder(Consts): return True if self._docs else False s = MyDoc("aiani dedomena/2009-04-22-14-52-16.pdf") s.chunk_document(chunking_type=MyDoc.ByChar) s.specify_color("fdfdfdf") print(s.get_chunks()) embedder = Embedder() embedder.load_docs(chunking_type=Embedder.ByChar) print(f"Documents:\n\n{embedder.get_docs()}") print(f"chunks:\n\n{embedder.get_chunks()}") mydoc.py +21 −1 Original line number Diff line number Diff line Loading @@ -67,10 +67,27 @@ class MyDoc(Consts): # Basic text extraction from pdf text = " ".join([page.page_content for page in self._pages]) self._title = self._find_title(text) text = text.replace("\n\n", " ").replace("\n", " ") # Title extraction self._text = text self._title = self._text[:self._text.index(" ")] def _find_title(self, text: str) -> str: """ Finds the title of a given text. :param text: The text to find its title. :return: The title. """ stoppers = ["\n\n", "\n", " ", "\xa0"] stop_index = [ind for ind in stoppers if text.find(ind)!=-1] if stop_index: title = text[:text.index(stop_index[0])] if len(title) > 20: title = text[:6] else: title = text[:10] return title def chunk_document(self, chunking_type=Consts.ByChar, color=None) -> None: Loading Loading @@ -124,5 +141,8 @@ class MyDoc(Consts): """ return True if self._chunks else False def __repr__(self): return f"Title: {self.get_title()}\n\nText: {self.get_text()}\n\n" Loading
embedder.py +18 −6 Original line number Diff line number Diff line Loading @@ -29,7 +29,7 @@ class Embedder(Consts): else: raise Exception("You need to load the documents first! Use load_docs()") def load_docs(self, directory="aiani dedomena/*", chunking_type=None, colors=list[None]) -> None: def load_docs(self, directory="aiani dedomena/*", chunking_type=None, colors=None) -> None: """ Loads the pdfs in MyDoc parser and saves them in self._docs. Also, if specified chunks the documents in the desired method. Loading @@ -43,8 +43,9 @@ class Embedder(Consts): # Load documents doc_paths = glob.glob(directory) # Load document paths for i in range(len(doc_paths)): color = colors[i] if colors else None doc_path = doc_paths[i] self._docs.append(MyDoc(doc_path, chunking_type=chunking_type, color=colors[i])) # Save docs in a list self._docs.append(MyDoc(doc_path, chunking_type=chunking_type, color=color)) # Save docs in a list # Load Chunks if specified if chunking_type: Loading @@ -57,6 +58,16 @@ class Embedder(Consts): """ return self._docs def get_chunks(self) -> list: """ Gets the loaded chunks from the loaded documents :return: list[chunks] """ if self._chunks: return self._chunks else: raise Exception("You need to chunk the loaded documents first!!! Use chunk_docs() method") def vectorize(self): pass Loading @@ -80,7 +91,8 @@ class Embedder(Consts): return True if self._docs else False s = MyDoc("aiani dedomena/2009-04-22-14-52-16.pdf") s.chunk_document(chunking_type=MyDoc.ByChar) s.specify_color("fdfdfdf") print(s.get_chunks()) embedder = Embedder() embedder.load_docs(chunking_type=Embedder.ByChar) print(f"Documents:\n\n{embedder.get_docs()}") print(f"chunks:\n\n{embedder.get_chunks()}")
mydoc.py +21 −1 Original line number Diff line number Diff line Loading @@ -67,10 +67,27 @@ class MyDoc(Consts): # Basic text extraction from pdf text = " ".join([page.page_content for page in self._pages]) self._title = self._find_title(text) text = text.replace("\n\n", " ").replace("\n", " ") # Title extraction self._text = text self._title = self._text[:self._text.index(" ")] def _find_title(self, text: str) -> str: """ Finds the title of a given text. :param text: The text to find its title. :return: The title. """ stoppers = ["\n\n", "\n", " ", "\xa0"] stop_index = [ind for ind in stoppers if text.find(ind)!=-1] if stop_index: title = text[:text.index(stop_index[0])] if len(title) > 20: title = text[:6] else: title = text[:10] return title def chunk_document(self, chunking_type=Consts.ByChar, color=None) -> None: Loading Loading @@ -124,5 +141,8 @@ class MyDoc(Consts): """ return True if self._chunks else False def __repr__(self): return f"Title: {self.get_title()}\n\nText: {self.get_text()}\n\n"