Commit 27d54137 authored by Stergios Papadopoulos's avatar Stergios Papadopoulos
Browse files

-Added _find_title method in MyDoc class

-Added __repr__ method in MyDoc class
-Added get_chunks on Embedder
-Tested Embedder and it works
parent 26e6073e
Loading
Loading
Loading
Loading
+18 −6
Original line number Diff line number Diff line
@@ -29,7 +29,7 @@ class Embedder(Consts):
        else:
            raise Exception("You need to load the documents first! Use load_docs()")

    def load_docs(self, directory="aiani dedomena/*", chunking_type=None, colors=list[None]) -> None:
    def load_docs(self, directory="aiani dedomena/*", chunking_type=None, colors=None) -> None:
        """
        Loads the pdfs in MyDoc parser and saves them in self._docs.
        Also, if specified chunks the documents in the desired method.
@@ -43,8 +43,9 @@ class Embedder(Consts):
        # Load documents
        doc_paths = glob.glob(directory) # Load document paths
        for i in range(len(doc_paths)):
            color = colors[i] if colors else None
            doc_path = doc_paths[i]
            self._docs.append(MyDoc(doc_path, chunking_type=chunking_type, color=colors[i])) # Save docs in a list
            self._docs.append(MyDoc(doc_path, chunking_type=chunking_type, color=color)) # Save docs in a list

        # Load Chunks if specified
        if chunking_type:
@@ -57,6 +58,16 @@ class Embedder(Consts):
        """
        return self._docs

    def get_chunks(self) -> list:
        """
        Returns the chunks currently stored in self._chunks.
        :return: list[chunks]
        :raises Exception: If self._chunks is empty (or otherwise falsy).
        """
        # Guard clause: refuse to hand back an empty/unset chunk collection.
        if not self._chunks:
            raise Exception("You need to chunk the loaded documents first!!! Use chunk_docs() method")
        return self._chunks

    def vectorize(self):
        """
        Placeholder for the vectorization step; it currently performs no work.
        :return: None
        """
        return None

@@ -80,7 +91,8 @@ class Embedder(Consts):
        return True if self._docs else False


# Manual check of MyDoc on a single PDF: chunk it with MyDoc.ByChar,
# pass a color string to specify_color() and print the resulting chunks.
# NOTE(review): these statements sit at module level, so they run on import;
# consider moving them under an `if __name__ == "__main__":` guard.
s = MyDoc("aiani dedomena/2009-04-22-14-52-16.pdf")
s.chunk_document(chunking_type=MyDoc.ByChar)
s.specify_color("fdfdfdf")
print(s.get_chunks())
# Manual check of Embedder: load the documents from load_docs()'s default
# directory with Embedder.ByChar chunking, then print the docs and the chunks.
embedder = Embedder()
embedder.load_docs(chunking_type=Embedder.ByChar)
print(f"Documents:\n\n{embedder.get_docs()}")
print(f"chunks:\n\n{embedder.get_chunks()}")
+21 −1
Original line number Diff line number Diff line
@@ -67,10 +67,27 @@ class MyDoc(Consts):

        # Basic text extraction from pdf
        text = " ".join([page.page_content for page in self._pages])
        self._title = self._find_title(text)
        text = text.replace("\n\n", " ").replace("\n", " ")
        # Title extraction
        self._text = text
        self._title = self._text[:self._text.index("  ")]

    def _find_title(self, text: str) -> str:
        """
        Finds the title of a given text.
        :param text: The text to find its title.
        :return: The title.
        """
        stoppers = ["\n\n", "\n", "  ", "\xa0"]
        stop_index = [ind for ind in stoppers if text.find(ind)!=-1]

        if stop_index:
            title = text[:text.index(stop_index[0])]
            if len(title) > 20:
                title = text[:6]
        else:
            title = text[:10]
        return title


    def chunk_document(self, chunking_type=Consts.ByChar, color=None) -> None:
@@ -124,5 +141,8 @@ class MyDoc(Consts):
        """
        return True if self._chunks else False

    def __repr__(self):
        return f"Title: {self.get_title()}\n\nText: {self.get_text()}\n\n"