-Added find_title method (27d54137) · Commits · MIRANET / Diploma Projects / AI Tour Guide

embedder.py

+18 −6

Original line number	Diff line number	Diff line
		@@ -29,7 +29,7 @@ class Embedder(Consts):
		else:
		raise Exception("You need to load the documents first! Use load_docs()")

		def load_docs(self, directory="aiani dedomena/*", chunking_type=None, colors=list[None]) -> None:
		def load_docs(self, directory="aiani dedomena/*", chunking_type=None, colors=None) -> None:
		"""
		Loads the pdfs in MyDoc parser and saves them in self._docs.
		Also, if specified chunks the documents in the desired method.
		@@ -43,8 +43,9 @@ class Embedder(Consts):
		# Load documents
		doc_paths = glob.glob(directory) # Load document paths
		for i in range(len(doc_paths)):
		color = colors[i] if colors else None
		doc_path = doc_paths[i]
		self._docs.append(MyDoc(doc_path, chunking_type=chunking_type, color=colors[i])) # Save docs in a list
		self._docs.append(MyDoc(doc_path, chunking_type=chunking_type, color=color)) # Save docs in a list

		# Load Chunks if specified
		if chunking_type:
		@@ -57,6 +58,16 @@ class Embedder(Consts):
		"""
		return self._docs

		def get_chunks(self) -> list:
		    """
		    Returns the chunks currently stored in self._chunks.

		    :return: The stored chunks (self._chunks).
		    :raises Exception: If self._chunks is empty (or otherwise falsy).
		    """
		    # Guard clause: refuse to hand back an empty/unset chunk collection.
		    if not self._chunks:
		        raise Exception("You need to chunk the loaded documents first!!! Use chunk_docs() method")
		    return self._chunks

		def vectorize(self):
		    """
		    Placeholder method: it has no implementation yet.

		    :return: None
		    """
		    return None

		@@ -80,7 +91,8 @@ class Embedder(Consts):
		return True if self._docs else False


		s = MyDoc("aiani dedomena/2009-04-22-14-52-16.pdf")
		s.chunk_document(chunking_type=MyDoc.ByChar)
		s.specify_color("fdfdfdf")
		print(s.get_chunks())
		embedder = Embedder()
		embedder.load_docs(chunking_type=Embedder.ByChar)
		print(f"Documents:\n\n{embedder.get_docs()}")
		print(f"chunks:\n\n{embedder.get_chunks()}")

mydoc.py

+21 −1

Original line number	Diff line number	Diff line
		@@ -67,10 +67,27 @@ class MyDoc(Consts):

		# Basic text extraction from pdf
		text = " ".join([page.page_content for page in self._pages])
		self._title = self._find_title(text)
		text = text.replace("\n\n", " ").replace("\n", " ")
		# Title extraction
		self._text = text
		self._title = self._text[:self._text.index(" ")]

		def _find_title(self, text: str) -> str:
		"""
		Finds the title of a given text.
		:param text: The text to find its title.
		:return: The title.
		"""
		stoppers = ["\n\n", "\n", " ", "\xa0"]
		stop_index = [ind for ind in stoppers if text.find(ind)!=-1]

		if stop_index:
		title = text[:text.index(stop_index[0])]
		if len(title) > 20:
		title = text[:6]
		else:
		title = text[:10]
		return title


		def chunk_document(self, chunking_type=Consts.ByChar, color=None) -> None:
		@@ -124,5 +141,8 @@ class MyDoc(Consts):
		"""
		return True if self._chunks else False

		def __repr__(self):
		return f"Title: {self.get_title()}\n\nText: {self.get_text()}\n\n"