Commit 26e6073e authored by Stergios Papadopoulos's avatar Stergios Papadopoulos
Browse files

-Added constants class.

-Improved MyDoc class.
-Created chunk_docs and load_docs methods in Embedder class.
-Created abstractions on Embedder class.
parent 99bbcc14
Loading
Loading
Loading
Loading

constantscls.py

0 → 100644
+2 −0
Original line number Diff line number Diff line
class Consts:
    """Namespace class holding shared string constants."""

    # Identifier for the "by character" chunking mode.
    ByChar: str = "BYCHAR"
 No newline at end of file
+71 −6
Original line number Diff line number Diff line
import glob
from mydoc import MyDoc
from constantscls import Consts

class embedder:
class Embedder(Consts):

    def __init__(self) -> None:
        """Create an embedder that holds no documents, chunks or vector DBs yet."""
        self._docs: list[MyDoc] = []  # MyDoc instances appended by load_docs()
        self._chunks = []  # chunks gathered from the documents by chunk_docs()
        self._vectorDBs = []  # not read or written by any other method visible here

    ## PRIVATE METHODS

    ## CALLABLE METHODS ##
    def chunk_docs(self, chunking_type=None, color=None):
        """
        Collects the chunks of every loaded document into self._chunks.

        Documents that already have chunks are reused as they are; the others
        are chunked first through their chunk_document() method with the given
        arguments. self._chunks is rebuilt on every call, so calling this
        method more than once does not store the same chunks several times.

        :param chunking_type: Constant from the Consts class, forwarded to
            chunk_document() for documents that have not been chunked yet.
        :param color: color forwarded to chunk_document() for documents that
            have not been chunked yet.
        :return: None
        :raises Exception: if no documents have been loaded.
        """
        if not self.has_loaded_docs():
            raise Exception("You need to load the documents first! Use load_docs()")

        collected = []
        for doc in self._docs:
            if not doc.has_chunks():  # Chunking has not been done yet
                doc.chunk_document(chunking_type=chunking_type, color=color)
            collected.extend(doc.get_chunks())

        # Assign only after every document succeeded, so a failure part-way
        # through leaves the previously collected chunks untouched.
        self._chunks = collected

    def load_docs(self, directory="aiani dedomena/*", chunking_type=None, colors=None) -> None:
        """
        Loads the files matching a glob pattern as MyDoc objects into self._docs.

        Every matching path is passed to the MyDoc constructor together with
        chunking_type and the color at the same position in colors. When
        chunking_type is given, chunk_docs() is called afterwards to gather
        the chunks.

        :param directory: glob pattern of the files to load, e.g. "smth/*".
        :param chunking_type: The chunking method of the documents. If not
            specified, no chunking is triggered here.
        :param colors: Optional sequence of colors to add as metadata in the
            chunks, matched to the files by position. Files without a
            corresponding entry (or all files when colors is None) get None.
        :return: None
        """
        # NOTE: colors are paired with paths in the order glob.glob() returns
        # them, which is not guaranteed to be sorted.
        doc_paths = glob.glob(directory)

        for i, doc_path in enumerate(doc_paths):
            # Fall back to None instead of failing when no color was supplied
            # for this position.
            color = colors[i] if colors is not None and i < len(colors) else None
            self._docs.append(MyDoc(doc_path, chunking_type=chunking_type, color=color))

        # Gather the chunks if chunking was requested
        if chunking_type:
            self.chunk_docs()

    def get_docs(self) -> list[MyDoc]:
        """
        Gets the loaded documents in MyDoc format.
        The internal list itself is returned, not a copy.
        :return: list[MyDoc]
        """
        return self._docs

    def vectorize(self):
        """Placeholder: not implemented yet, currently does nothing."""
        pass

    def add_to_vectordb(self):
        """Placeholder: not implemented yet, currently does nothing."""
        pass

    def load_docs(self):
        doc_paths = glob.glob("aiani dedomena/*")
    def delete_vectordb(self):
        """Placeholder: not implemented yet, currently does nothing."""
        pass

    def search_vectordb(self):
        """Placeholder: not implemented yet, currently does nothing."""
        pass

    def create_vector_store(self):
    def similarity_check(self):
        """Placeholder: not implemented yet, currently does nothing."""
        pass

    def has_loaded_docs(self):
        """
        Tells whether any document is currently stored in self._docs.
        :return: True when self._docs is non-empty, False otherwise.
        """
        return bool(self._docs)


# Ad-hoc manual check that runs whenever this module is executed or imported:
# parse one PDF, chunk it with the ByChar mode, set a color and print the chunks.
# NOTE(review): consider moving this under `if __name__ == "__main__":` so that
# importing the module does not load a PDF and print.
s = MyDoc("aiani dedomena/2009-04-22-14-52-16.pdf")
s.chunk_document(chunking_type=MyDoc.ByChar)
s.specify_color("fdfdfdf")  # NOTE(review): 7 characters, not a 6-digit hex color; confirm intended value
print(s.get_chunks())
 No newline at end of file
+36 −8
Original line number Diff line number Diff line
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from constantscls import Consts

class MyDoc:
    """PDF Parser, can chunk the document's text. Just specify the pdf path on the constructor!"""
class MyDoc(Consts):

    # CONSTANTS
    ByChar: str = "BYCHAR"
    EXCLUDE = [". \xa0 ", ".   ","1 / 3 ΣΤΩΪΚΟ ΚΤΗΡΙΟ \xa0 ", "\xa0", "\xa0\xa0\xa0",
               "\xa0\xa0 \xa0\xa0\xa0\xa0\xa0\xa0", "2 / 3 ΣΤΩΪΚΟ ΚΤΗΡΙΟ", "\xa0 \xa0 \xa0", "3 / 3"]

    def __init__(self, filepath: str):
    def __init__(self, filepath: str, chunking_type=None, color=None):
        """
        PDF Parser, can chunk the document's text. Just specify the pdf path on the constructor!
        You can also specify a chunking type so that the chunking could be done immediately.
        If you specify a color it will be added in the chunk metadata.

        :param filepath: the path of the document to load
        :param chunking_type: The chunking type. If not specified the chunking will not happen. You can do it later.
        :param color: If specified the color to add as metadata. If not you can add it later.
        """
        ### INITIALIZATION ###
        self._loader = PyPDFLoader(filepath)
        self._pages = self._loader.load()
        self._text = None
@@ -18,8 +26,12 @@ class MyDoc:
        self._chunks = None
        self._title = None

        ### ACTIONS ###
        self._load_text()  # Load text and title

        if chunking_type:
            self.chunk_document(chunking_type=chunking_type, color=color)


    def _chunk_by_char(self, chunk_size=500, chunk_overlap=20, length_function=len) -> None:
        """Chunks the text recursively. The end result is close to chunking by paragraph"""
@@ -31,7 +43,14 @@ class MyDoc:
        )

        chunks = self._text_splitter.split_text(self._text)
        self._chunks = self._text_splitter.create_documents(chunks, metadatas=[{"title": self._title} for i in range(len(chunks))])
        self._chunks = self._text_splitter.create_documents(
            chunks,
            metadatas=[
                {
                    "title": self._title,
                    "_id": f"{self._title}-{i}"
                } for i in range(len(chunks))]
        )


    def get_text(self) -> str:
@@ -45,6 +64,7 @@ class MyDoc:

    def _load_text(self):
        """Extracts the text from the pdf pages"""

        # Basic text extraction from pdf
        text = " ".join([page.page_content for page in self._pages])
        text = text.replace("\n\n", " ").replace("\n", " ")
@@ -53,7 +73,7 @@ class MyDoc:
        self._title = self._text[:self._text.index("  ")]


    def chunk_document(self, chunking_type=ByChar, color=None) -> None:
    def chunk_document(self, chunking_type=Consts.ByChar, color=None) -> None:
        """
        Chunks document depending the chunking type specified
        :param chunking_type: The chunking method of the document's text.
@@ -73,7 +93,7 @@ class MyDoc:
        Gets the chunks. Need to chunk the document first!!
        :return: list[Document]
        """
        if self._chunks:
        if self.has_chunks():
            return self._chunks
        raise Exception("You need to chunk the document first")

@@ -97,4 +117,12 @@ class MyDoc:
        else:
            raise Exception("You need to specify the color!!!")

    def has_chunks(self) -> bool:
        """
        Reports whether self._chunks currently holds any chunks.
        :return: True when self._chunks is non-empty, False when it is None or empty.
        """
        return bool(self._chunks)