-Added constants class. (26e6073e) · Commits · MIRANET / Diploma Projects / AI Tour Guide

constantscls.py

0 → 100644

+2 −0

Original line number	Diff line number	Diff line
		class Consts:
		    """Container for string constants shared by the document classes."""

		    # Chunking-type value; MyDoc.chunk_document uses it as its default chunking_type.
		    ByChar: str = "BYCHAR"
		No newline at end of file

embedder.py

+71 −6

Original line number	Diff line number	Diff line
		import glob
		from mydoc import MyDoc
		from constantscls import Consts

		class embedder:
		class Embedder(Consts):

		def __init__(self):
		self._docs: list[MyDoc] = []
		self._chunks = []
		self._vectorDBs = []

		## PRIVATE METHODS

		## CALLABLE METHODS ##
		def chunk_docs(self, chunking_type=None, color=None):
		    """
		    Collects the chunks of every loaded document into self._chunks.

		    Documents that already have chunks are reused as they are; the others
		    are chunked first with the given chunking type and color.
		    The collected list is rebuilt on every call, so calling this method
		    more than once (load_docs() already calls it when a chunking type is
		    given) does not store the same chunks twice.
		    :param chunking_type: Constant from Consts class.
		    :param color: color to add to the chunks if specified.
		    :return: None
		    :raises Exception: if no documents have been loaded yet.
		    """
		    if not self.has_loaded_docs():
		        raise Exception("You need to load the documents first! Use load_docs()")

		    collected = []
		    for doc in self._docs:
		        if not doc.has_chunks():  # Chunking has not been done yet
		            # NOTE(review): chunking_type=None is forwarded as-is and so bypasses
		            # chunk_document's own default — confirm chunk_document accepts None.
		            doc.chunk_document(chunking_type=chunking_type, color=color)
		        collected.extend(doc.get_chunks())

		    # Replace instead of extend: every document is visited on each call
		    self._chunks = collected

		def load_docs(self, directory="aiani dedomena/*", chunking_type=None, colors=None) -> None:
		    """
		    Loads the pdfs in MyDoc parser and saves them in self._docs.
		    Also, if a chunking type is specified, chunks the documents with that method.
		    If colors are specified, each one is passed to its document as color metadata.

		    Paths are processed in sorted order, so colors[i] is paired with the
		    i-th path of the sorted glob result. Documents without a matching
		    entry in colors (all of them when colors is None) get no color.
		    :param colors: Sequence of hexadecimal colors to add as metadata in the chunks,
		                   or None (default) for no colors.
		    :param chunking_type: The chunking method of the documents.
		    :param directory: glob pattern of the pdfs to load, should look like "smth/*".
		    :return: None
		    """
		    if colors is None:
		        colors = []

		    # glob returns paths in arbitrary order; sort to keep the color pairing stable
		    doc_paths = sorted(glob.glob(directory))
		    for i, doc_path in enumerate(doc_paths):
		        # Fewer colors than documents is allowed: the rest get no color
		        color = colors[i] if i < len(colors) else None
		        self._docs.append(MyDoc(doc_path, chunking_type=chunking_type, color=color))  # Save docs in a list

		    # Collect the chunks if a chunking type was specified
		    if chunking_type:
		        self.chunk_docs(chunking_type=chunking_type)

		def get_docs(self) -> list[MyDoc]:
		"""
		Gets the loaded pdfs in MyDoc format
		:return: list[MyDoc]
		"""
		return self._docs

		def vectorize(self):
		    """Placeholder: not implemented yet, does nothing and returns None."""
		    return None

		def add_to_vectordb(self):
		    """Placeholder: not implemented yet, does nothing and returns None."""
		    return None

		def load_docs(self):
		doc_paths = glob.glob("aiani dedomena/*")
		def delete_vectordb(self):
		    """Placeholder: not implemented yet, does nothing and returns None."""
		    return None

		def search_vectordb(self):
		    """Placeholder: not implemented yet, does nothing and returns None."""
		    return None

		def create_vector_store(self):
		def similarity_check(self):
		    """Placeholder: not implemented yet, does nothing and returns None."""
		    return None

		def has_loaded_docs(self):
		    """
		    Tells whether at least one document has been loaded.
		    :return: True when self._docs is non-empty, False otherwise.
		    """
		    return bool(self._docs)


		# Ad-hoc manual check: load one PDF, chunk it, tag a color and print the chunks.
		# NOTE(review): these statements appear to sit at module top level, so they
		# would run on every import of this module — confirm that is intended.
		s = MyDoc("aiani dedomena/2009-04-22-14-52-16.pdf")
		s.chunk_document(chunking_type=MyDoc.ByChar)
		# NOTE(review): "fdfdfdf" has seven characters — confirm the intended color value.
		s.specify_color("fdfdfdf")
		print(s.get_chunks())
		No newline at end of file

mydoc.py

+36 −8

Original line number	Diff line number	Diff line
		from langchain_community.document_loaders import PyPDFLoader
		from langchain_text_splitters import RecursiveCharacterTextSplitter
		from constantscls import Consts

		class MyDoc:
		"""PDF Parser, can chunk the document's text. Just specify the pdf path on the constructor!"""
		class MyDoc(Consts):

		# CONSTANTS
		ByChar: str = "BYCHAR"
		EXCLUDE = [". \xa0 ", ". ","1 / 3 ΣΤΩΪΚΟ ΚΤΗΡΙΟ \xa0 ", "\xa0", "\xa0\xa0\xa0",
		"\xa0\xa0 \xa0\xa0\xa0\xa0\xa0\xa0", "2 / 3 ΣΤΩΪΚΟ ΚΤΗΡΙΟ", "\xa0 \xa0 \xa0", "3 / 3"]

		def __init__(self, filepath: str):
		def __init__(self, filepath: str, chunking_type=None, color=None):
		"""
		PDF Parser, can chunk the document's text. Just specify the pdf path on the constructor!
		You can also specify a chunking type so that the chunking could be done immediately.
		If you specify a color it will be added in the chunk metadata.

		:param filepath: the path of the document to load
		:param chunking_type: The chunking type. If not specified the chunking will not happen. You can do it later.
		:param color: If specified the color to add as metadata. If not you can add it later.
		"""
		### INITIALIZATION ###
		self._loader = PyPDFLoader(filepath)
		self._pages = self._loader.load()
		self._text = None
		@@ -18,8 +26,12 @@ class MyDoc:
		self._chunks = None
		self._title = None

		### ACTIONS ###
		self._load_text() # Load text and title

		if chunking_type:
		self.chunk_document(chunking_type=chunking_type, color=color)


		def _chunk_by_char(self, chunk_size=500, chunk_overlap=20, length_function=len) -> None:
		"""Chunks the text recursively. The end result is close to chunking by paragraph"""
		@@ -31,7 +43,14 @@ class MyDoc:
		)

		chunks = self._text_splitter.split_text(self._text)
		self._chunks = self._text_splitter.create_documents(chunks, metadatas=[{"title": self._title} for i in range(len(chunks))])
		self._chunks = self._text_splitter.create_documents(
		chunks,
		metadatas=[
		{
		"title": self._title,
		"_id": f"{self._title}-{i}"
		} for i in range(len(chunks))]
		)


		def get_text(self) -> str:
		@@ -45,6 +64,7 @@ class MyDoc:

		def _load_text(self):
		"""Extracts the text from the pdf pages"""

		# Basic text extraction from pdf
		text = " ".join([page.page_content for page in self._pages])
		text = text.replace("\n\n", " ").replace("\n", " ")
		@@ -53,7 +73,7 @@ class MyDoc:
		self._title = self._text[:self._text.index(" ")]


		def chunk_document(self, chunking_type=ByChar, color=None) -> None:
		def chunk_document(self, chunking_type=Consts.ByChar, color=None) -> None:
		"""
		Chunks document depending the chunking type specified
		:param chunking_type: The chunking method of the document's text.
		@@ -73,7 +93,7 @@ class MyDoc:
		Gets the chunks. Need to chunk the document first!!
		:return: list[Document]
		"""
		if self._chunks:
		if self.has_chunks():
		return self._chunks
		raise Exception("You need to chunk the document first")

		@@ -97,4 +117,12 @@ class MyDoc:
		else:
		raise Exception("You need to specify the color!!!")

		def has_chunks(self) -> bool:
		    """
		    Tells whether the document currently holds any chunks.
		    :return: True when self._chunks is non-empty, False when it is None or empty.
		    """
		    return bool(self._chunks)