Commit e8e8b463 authored by Stergios Papadopoulos's avatar Stergios Papadopoulos
Browse files

mydoc finished

parents
Loading
Loading
Loading
Loading

embedder.py

0 → 100644
+21 −0
Original line number Diff line number Diff line
import glob
from mydoc import MyDoc

class embedder:

    def __init__(self):
        pass

    def load_docs(self):
        doc_paths = glob.glob("aiani dedomena/*")
        

    def create_vector_store(self):
        pass


s = MyDoc("aiani dedomena/2009-04-22-14-52-16.pdf")
s.chunk_document(chunking_type=MyDoc.ByChar)
s.specify_color("fdfdfdf")
print(s.get_chunks())
 No newline at end of file

mydoc.py

0 → 100644
+100 −0
Original line number Diff line number Diff line
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

class MyDoc:
    """PDF Parser, can chunk the document's text. Just specify the pdf path on the constructor!"""

    # CONSTANTS
    ByChar: str = "BYCHAR"
    EXCLUDE = [". \xa0 ", ".   ","1 / 3 ΣΤΩΪΚΟ ΚΤΗΡΙΟ \xa0 ", "\xa0", "\xa0\xa0\xa0",
               "\xa0\xa0 \xa0\xa0\xa0\xa0\xa0\xa0", "2 / 3 ΣΤΩΪΚΟ ΚΤΗΡΙΟ", "\xa0 \xa0 \xa0", "3 / 3"]

    def __init__(self, filepath: str):

        self._loader = PyPDFLoader(filepath)
        self._pages = self._loader.load()
        self._text = None
        self._text_splitter = None
        self._chunks = None
        self._title = None

        self._load_text() # Load text and title


    def _chunk_by_char(self, chunk_size=500, chunk_overlap=20, length_function=len) -> None:
        """Chunks the text recursively. The end result is close to chunking by paragraph"""
        self._text_splitter = RecursiveCharacterTextSplitter(
            chunk_overlap=chunk_overlap,
            chunk_size=chunk_size,
            length_function=length_function,
            separators=["\n\xa0", "\n\n", "."]
        )

        chunks = self._text_splitter.split_text(self._text)
        self._chunks = self._text_splitter.create_documents(chunks, metadatas=[{"title": self._title} for i in range(len(chunks))])


    def get_text(self) -> str:
        return self._text

    def get_pages(self) -> list:
        return self._pages

    def get_title(self) -> str:
        return self._title

    def _load_text(self):
        """Extracts the text from the pdf pages"""
        # Basic text extraction from pdf
        text = " ".join([page.page_content for page in self._pages])
        text = text.replace("\n\n", " ").replace("\n", " ")
        # Title extraction
        self._text = text
        self._title = self._text[:self._text.index("  ")]


    def chunk_document(self, chunking_type=ByChar, color=None) -> None:
        """
        Chunks document depending the chunking type specified
        :param chunking_type: The chunking method of the document's text.
        :param color: The hexadecimal code of the color metadata of the chunks.
        :return: None
        """
        if chunking_type == MyDoc.ByChar:
            self._chunk_by_char()
            if color:
                self.specify_color(hexadecimal_code=color)
        else:
            Exception("You need to set the chunking type!!!")
        self._clear_chunks()

    def get_chunks(self) -> list:
        """
        Gets the chunks. Need to chunk the document first!!
        :return: list[Document]
        """
        if self._chunks:
            return self._chunks
        raise Exception("You need to chunk the document first")

    def _clear_chunks(self):
        """Clears the chunks from unwanted characters. Should be called after the chunks have benn created!!!"""
        chunks = []
        for chunk in self._chunks:
            for item in MyDoc.EXCLUDE:
                chunk.page_content = chunk.page_content.replace(item, "")
            chunks.append(chunk)
        self._chunks = chunks

    def specify_color(self, hexadecimal_code=None):
        """Creates a new metadata of the color on the chunks created from the instance"""
        if hexadecimal_code:
            if self._chunks:
                for chunk in self._chunks:
                    chunk.metadata["color"] = hexadecimal_code
            else:
                raise Exception("You need to chunk the document first!")
        else:
            raise Exception("You need to specify the color!!!")