mydoc finished (e8e8b463) · Commits · MIRANET / Diploma Projects / AI Tour Guide

embedder.py

0 → 100644

+21 −0

Original line number	Diff line number	Diff line
		import glob
		from mydoc import MyDoc

		class embedder:

		def __init__(self):
		pass

		def load_docs(self):
		doc_paths = glob.glob("aiani dedomena/*")


		def create_vector_store(self):
		pass


		s = MyDoc("aiani dedomena/2009-04-22-14-52-16.pdf")
		s.chunk_document(chunking_type=MyDoc.ByChar)
		s.specify_color("fdfdfdf")
		print(s.get_chunks())
		No newline at end of file

mydoc.py

0 → 100644

+100 −0

Original line number	Diff line number	Diff line
		from langchain_community.document_loaders import PyPDFLoader
		from langchain_text_splitters import RecursiveCharacterTextSplitter

		class MyDoc:
		"""PDF Parser, can chunk the document's text. Just specify the pdf path on the constructor!"""

		# CONSTANTS
		ByChar: str = "BYCHAR"
		EXCLUDE = [". \xa0 ", ". ","1 / 3 ΣΤΩΪΚΟ ΚΤΗΡΙΟ \xa0 ", "\xa0", "\xa0\xa0\xa0",
		"\xa0\xa0 \xa0\xa0\xa0\xa0\xa0\xa0", "2 / 3 ΣΤΩΪΚΟ ΚΤΗΡΙΟ", "\xa0 \xa0 \xa0", "3 / 3"]

		def __init__(self, filepath: str):

		self._loader = PyPDFLoader(filepath)
		self._pages = self._loader.load()
		self._text = None
		self._text_splitter = None
		self._chunks = None
		self._title = None

		self._load_text() # Load text and title


		def _chunk_by_char(self, chunk_size=500, chunk_overlap=20, length_function=len) -> None:
		"""Chunks the text recursively. The end result is close to chunking by paragraph"""
		self._text_splitter = RecursiveCharacterTextSplitter(
		chunk_overlap=chunk_overlap,
		chunk_size=chunk_size,
		length_function=length_function,
		separators=["\n\xa0", "\n\n", "."]
		)

		chunks = self._text_splitter.split_text(self._text)
		self._chunks = self._text_splitter.create_documents(chunks, metadatas=[{"title": self._title} for i in range(len(chunks))])


		def get_text(self) -> str:
		return self._text

		def get_pages(self) -> list:
		return self._pages

		def get_title(self) -> str:
		return self._title

		def _load_text(self):
		"""Extracts the text from the pdf pages"""
		# Basic text extraction from pdf
		text = " ".join([page.page_content for page in self._pages])
		text = text.replace("\n\n", " ").replace("\n", " ")
		# Title extraction
		self._text = text
		self._title = self._text[:self._text.index(" ")]


		def chunk_document(self, chunking_type=ByChar, color=None) -> None:
		"""
		Chunks document depending the chunking type specified
		:param chunking_type: The chunking method of the document's text.
		:param color: The hexadecimal code of the color metadata of the chunks.
		:return: None
		"""
		if chunking_type == MyDoc.ByChar:
		self._chunk_by_char()
		if color:
		self.specify_color(hexadecimal_code=color)
		else:
		Exception("You need to set the chunking type!!!")
		self._clear_chunks()

		def get_chunks(self) -> list:
		"""
		Gets the chunks. Need to chunk the document first!!
		:return: list[Document]
		"""
		if self._chunks:
		return self._chunks
		raise Exception("You need to chunk the document first")

		def _clear_chunks(self):
		"""Clears the chunks from unwanted characters. Should be called after the chunks have benn created!!!"""
		chunks = []
		for chunk in self._chunks:
		for item in MyDoc.EXCLUDE:
		chunk.page_content = chunk.page_content.replace(item, "")
		chunks.append(chunk)
		self._chunks = chunks

		def specify_color(self, hexadecimal_code=None):
		"""Creates a new metadata of the color on the chunks created from the instance"""
		if hexadecimal_code:
		if self._chunks:
		for chunk in self._chunks:
		chunk.metadata["color"] = hexadecimal_code
		else:
		raise Exception("You need to chunk the document first!")
		else:
		raise Exception("You need to specify the color!!!")