Loading constantscls.py 0 → 100644 +2 −0 Original line number Diff line number Diff line class Consts: ByChar: str = "BYCHAR" No newline at end of file embedder.py +71 −6 Original line number Diff line number Diff line import glob from mydoc import MyDoc from constantscls import Consts class embedder: class Embedder(Consts): def __init__(self): self._docs: list[MyDoc] = [] self._chunks = [] self._vectorDBs = [] ## PRIVATE METHODS ## CALLABLE METHODS ## def chunk_docs(self, chunking_type=None, color=None): """ Chunks the documents in a specified type. :param chunking_type: Constant from Conts class. :param color: color to add to the chunks if specified. :return: None """ if self.has_loaded_docs(): for doc in self._docs: if doc.has_chunks(): # Chunking has been done self._chunks.extend(doc.get_chunks()) else: doc.chunk_document(chunking_type=chunking_type, color=color) self._chunks.extend(doc.get_chunks()) else: raise Exception("You need to load the documents first! Use load_docs()") def load_docs(self, directory="aiani dedomena/*", chunking_type=None, colors=list[None]) -> None: """ Loads the pdfs in MyDoc parser and saves them in self._docs. Also, if specified chunks the documents in the desired method. If specified a color metadata will be added. :param colors: List of hexadecimal colors to add as metadata in the chunks. :param chunking_type: The chunking method of the documents. :param directory: the path of the directory where the pdfs to load are located, should "smth/*". 
:return: None """ # Load documents doc_paths = glob.glob(directory) # Load document paths for i in range(len(doc_paths)): doc_path = doc_paths[i] self._docs.append(MyDoc(doc_path, chunking_type=chunking_type, color=colors[i])) # Save docs in a list # Load Chunks if specified if chunking_type: self.chunk_docs() def get_docs(self) -> list[MyDoc]: """ Gets the loaded pdfs in MyDoc format :return: list[MyDoc] """ return self._docs def vectorize(self): pass def add_to_vectordb(self): pass def load_docs(self): doc_paths = glob.glob("aiani dedomena/*") def delete_vectordb(self): pass def search_vectordb(self): pass def create_vector_store(self): def similarity_check(self): pass def has_loaded_docs(self): """ Checks if documents have been loaded or not. :return: True if there are any documents False otherwise. """ return True if self._docs else False s = MyDoc("aiani dedomena/2009-04-22-14-52-16.pdf") s.chunk_document(chunking_type=MyDoc.ByChar) s.specify_color("fdfdfdf") print(s.get_chunks()) #hjhjhj No newline at end of file mydoc.py +36 −8 Original line number Diff line number Diff line from langchain_community.document_loaders import PyPDFLoader from langchain_text_splitters import RecursiveCharacterTextSplitter from constantscls import Consts class MyDoc: """PDF Parser, can chunk the document's text. Just specify the pdf path on the constructor!""" class MyDoc(Consts): # CONSTANTS ByChar: str = "BYCHAR" EXCLUDE = [". \xa0 ", ". ","1 / 3 ΣΤΩΪΚΟ ΚΤΗΡΙΟ \xa0 ", "\xa0", "\xa0\xa0\xa0", "\xa0\xa0 \xa0\xa0\xa0\xa0\xa0\xa0", "2 / 3 ΣΤΩΪΚΟ ΚΤΗΡΙΟ", "\xa0 \xa0 \xa0", "3 / 3"] def __init__(self, filepath: str): def __init__(self, filepath: str, chunking_type=None, color=None): """ PDF Parser, can chunk the document's text. Just specify the pdf path on the constructor! You can also specify a chunking type so that the chunking could be done immediately. If you specify a color it will be added in the chunk metadata. 
:param filepath: the path of the document to load :param chunking_type: The chunking type. If not specified the chunking will not happen. You can do it later. :param color: If specified the color to add as metadata. If not you can add it later. """ ### INITIALIZATION ### self._loader = PyPDFLoader(filepath) self._pages = self._loader.load() self._text = None Loading @@ -18,8 +26,12 @@ class MyDoc: self._chunks = None self._title = None ### ACTIONS ### self._load_text() # Load text and title if chunking_type: self.chunk_document(chunking_type=chunking_type, color=color) def _chunk_by_char(self, chunk_size=500, chunk_overlap=20, length_function=len) -> None: """Chunks the text recursively. The end result is close to chunking by paragraph""" Loading @@ -31,7 +43,14 @@ class MyDoc: ) chunks = self._text_splitter.split_text(self._text) self._chunks = self._text_splitter.create_documents(chunks, metadatas=[{"title": self._title} for i in range(len(chunks))]) self._chunks = self._text_splitter.create_documents( chunks, metadatas=[ { "title": self._title, "_id": f"{self._title}-{i}" } for i in range(len(chunks))] ) def get_text(self) -> str: Loading @@ -45,6 +64,7 @@ class MyDoc: def _load_text(self): """Extracts the text from the pdf pages""" # Basic text extraction from pdf text = " ".join([page.page_content for page in self._pages]) text = text.replace("\n\n", " ").replace("\n", " ") Loading @@ -53,7 +73,7 @@ class MyDoc: self._title = self._text[:self._text.index(" ")] def chunk_document(self, chunking_type=ByChar, color=None) -> None: def chunk_document(self, chunking_type=Consts.ByChar, color=None) -> None: """ Chunks document depending the chunking type specified :param chunking_type: The chunking method of the document's text. Loading @@ -73,7 +93,7 @@ class MyDoc: Gets the chunks. Need to chunk the document first!! 
:return: list[Document] """ if self._chunks: if self.has_chunks(): return self._chunks raise Exception("You need to chunk the document first") Loading @@ -97,4 +117,12 @@ class MyDoc: else: raise Exception("You need to specify the color!!!") def has_chunks(self) -> bool: """ True if chunking has been done else False :return: bool """ return True if self._chunks else False Loading
# constantscls.py
class Consts:
    """Shared constants that other classes inherit or reference."""

    # Chunking-type identifier; used as the `chunking_type` value for
    # character-based chunking of a document.
    ByChar: str = "BYCHAR"
# embedder.py
"""Load PDF documents as MyDoc objects and gather their chunks."""
import glob
import itertools

from mydoc import MyDoc
from constantscls import Consts


class Embedder(Consts):
    """
    Holds the loaded MyDoc documents and the chunks collected from them.

    Inherits the chunking-type constants (e.g. ``ByChar``) from ``Consts``.
    """

    def __init__(self):
        self._docs: list[MyDoc] = []  # documents added by load_docs()
        self._chunks = []  # chunks gathered from all loaded documents
        self._vectorDBs = []  # not used by any method yet

    ## PRIVATE METHODS

    ## CALLABLE METHODS ##
    def chunk_docs(self, chunking_type=None, color=None) -> None:
        """
        Chunks every loaded document that has no chunks yet and gathers the
        chunks of all loaded documents in self._chunks.

        :param chunking_type: Constant from the Consts class.
        :param color: color to add to the chunks if specified.
        :return: None
        :raises RuntimeError: if no documents have been loaded.
        """
        if not self.has_loaded_docs():
            raise RuntimeError("You need to load the documents first! Use load_docs()")

        collected = []
        for doc in self._docs:
            if not doc.has_chunks():
                doc.chunk_document(chunking_type=chunking_type, color=color)
            collected.extend(doc.get_chunks())

        # Rebuild rather than extend, so that calling this method more than
        # once does not store the same chunks several times.
        self._chunks = collected

    def load_docs(self, directory="aiani dedomena/*", chunking_type=None, colors=None) -> None:
        """
        Loads the pdfs in MyDoc parser and saves them in self._docs.
        Also, if specified, chunks the documents with the desired method.
        If specified, a color metadata will be added.

        :param directory: glob pattern of the pdfs to load, should be "smth/*".
        :param chunking_type: The chunking method of the documents.
        :param colors: List of hexadecimal colors to add as metadata in the chunks,
            matched by position with the sorted document paths. Documents without
            a matching entry get no color.
        :return: None
        :raises RuntimeError: if chunking was requested but no document matched.
        """
        # glob returns paths in arbitrary order; sort so that the positional
        # pairing with `colors` is the same on every run.
        doc_paths = sorted(glob.glob(directory))

        # Pad with None so a missing or short color list never raises.
        padded_colors = itertools.chain(colors or (), itertools.repeat(None))

        for doc_path, doc_color in zip(doc_paths, padded_colors):
            self._docs.append(MyDoc(doc_path, chunking_type=chunking_type, color=doc_color))

        # Collect the chunks if chunking was requested
        if chunking_type:
            self.chunk_docs(chunking_type=chunking_type)

    def get_docs(self) -> list[MyDoc]:
        """
        Gets the loaded pdfs in MyDoc format

        :return: list[MyDoc]
        """
        return self._docs

    def vectorize(self):
        """Placeholder, not implemented yet."""
        pass

    def add_to_vectordb(self):
        """Placeholder, not implemented yet."""
        pass

    def delete_vectordb(self):
        """Placeholder, not implemented yet."""
        pass

    def search_vectordb(self):
        """Placeholder, not implemented yet."""
        pass

    def similarity_check(self):
        """Placeholder, not implemented yet."""
        pass

    def has_loaded_docs(self) -> bool:
        """
        Checks if documents have been loaded or not.

        :return: True if there are any documents, False otherwise.
        """
        return bool(self._docs)


if __name__ == "__main__":
    # Manual smoke test. Guarded so that importing this module does not
    # parse a PDF and print its chunks as a side effect.
    s = MyDoc("aiani dedomena/2009-04-22-14-52-16.pdf")
    s.chunk_document(chunking_type=MyDoc.ByChar)
    s.specify_color("fdfdfdf")
    print(s.get_chunks())
mydoc.py +36 −8 Original line number Diff line number Diff line from langchain_community.document_loaders import PyPDFLoader from langchain_text_splitters import RecursiveCharacterTextSplitter from constantscls import Consts class MyDoc: """PDF Parser, can chunk the document's text. Just specify the pdf path on the constructor!""" class MyDoc(Consts): # CONSTANTS ByChar: str = "BYCHAR" EXCLUDE = [". \xa0 ", ". ","1 / 3 ΣΤΩΪΚΟ ΚΤΗΡΙΟ \xa0 ", "\xa0", "\xa0\xa0\xa0", "\xa0\xa0 \xa0\xa0\xa0\xa0\xa0\xa0", "2 / 3 ΣΤΩΪΚΟ ΚΤΗΡΙΟ", "\xa0 \xa0 \xa0", "3 / 3"] def __init__(self, filepath: str): def __init__(self, filepath: str, chunking_type=None, color=None): """ PDF Parser, can chunk the document's text. Just specify the pdf path on the constructor! You can also specify a chunking type so that the chunking could be done immediately. If you specify a color it will be added in the chunk metadata. :param filepath: the path of the document to load :param chunking_type: The chunking type. If not specified the chunking will not happen. You can do it later. :param color: If specified the color to add as metadata. If not you can add it later. """ ### INITIALIZATION ### self._loader = PyPDFLoader(filepath) self._pages = self._loader.load() self._text = None Loading @@ -18,8 +26,12 @@ class MyDoc: self._chunks = None self._title = None ### ACTIONS ### self._load_text() # Load text and title if chunking_type: self.chunk_document(chunking_type=chunking_type, color=color) def _chunk_by_char(self, chunk_size=500, chunk_overlap=20, length_function=len) -> None: """Chunks the text recursively. 
The end result is close to chunking by paragraph""" Loading @@ -31,7 +43,14 @@ class MyDoc: ) chunks = self._text_splitter.split_text(self._text) self._chunks = self._text_splitter.create_documents(chunks, metadatas=[{"title": self._title} for i in range(len(chunks))]) self._chunks = self._text_splitter.create_documents( chunks, metadatas=[ { "title": self._title, "_id": f"{self._title}-{i}" } for i in range(len(chunks))] ) def get_text(self) -> str: Loading @@ -45,6 +64,7 @@ class MyDoc: def _load_text(self): """Extracts the text from the pdf pages""" # Basic text extraction from pdf text = " ".join([page.page_content for page in self._pages]) text = text.replace("\n\n", " ").replace("\n", " ") Loading @@ -53,7 +73,7 @@ class MyDoc: self._title = self._text[:self._text.index(" ")] def chunk_document(self, chunking_type=ByChar, color=None) -> None: def chunk_document(self, chunking_type=Consts.ByChar, color=None) -> None: """ Chunks document depending the chunking type specified :param chunking_type: The chunking method of the document's text. Loading @@ -73,7 +93,7 @@ class MyDoc: Gets the chunks. Need to chunk the document first!! :return: list[Document] """ if self._chunks: if self.has_chunks(): return self._chunks raise Exception("You need to chunk the document first") Loading @@ -97,4 +117,12 @@ class MyDoc: else: raise Exception("You need to specify the color!!!") def has_chunks(self) -> bool: """ True if chunking has been done else False :return: bool """ return True if self._chunks else False