- Added functionality for showing sources of answers (improved system prompt and unused method). (89f4bea8) · Commits · MIRANET / Diploma Projects / AI Tour Guide

embedder.py

+33 −19

Original line number	Diff line number	Diff line
		@@ -57,7 +57,8 @@ class Embedder(Consts):
		title = chunk.metadata["title"]
		text = chunk.page_content
		color = chunk.metadata.get("color", "black") #TODO put the hexadecimal value if needed
		yield _id, title, text, color
		doc_path = chunk.metadata["doc_path"]
		yield _id, title, text, color, doc_path

		def _set_embedding_func(self, model="text-embedding-3-small"):
		"""
		@@ -202,11 +203,16 @@ class Embedder(Consts):
		"""
		return self._docs

		def get_chunks(self) -> list:
		def get_chunks(self, collection_name=None) -> list:
		"""
		Gets the loaded chunks from the loaded documents
		Gets the loaded chunks from the loaded documents. If a collection_name is specified, the chunks will be fetched from that collection.
		:param collection_name: If specified, the chunks of this collection will be fetched; otherwise the loaded chunks are returned.
		:return: list[chunks]
		"""
		if collection_name:
		collection = self._get_collection_error(collection_name)
		return collection.get()

		if self._chunks:
		return self._chunks
		else:
		@@ -219,21 +225,22 @@ class Embedder(Consts):
		:param collection_name: The name of the collection to add the data.
		:return: None
		"""
		# If collection exists get it else create it.
		# If a collection exists get it else create it.
		collection = self._get_collection(collection_name)
		if not collection:
		collection = self._create_collection(collection_name, embedding_model)

		collection.add(
		documents=[text for _, _, text, _ in self._vectors_generator()],
		documents=[text for _, _, text, _, _ in self._vectors_generator()],
		metadatas=[
		{
		"id": _id,
		"title": title,
		"color": color,
		} for _id, title, text, color in self._vectors_generator()
		"doc_path": doc_path
		} for _id, title, text, color, doc_path in self._vectors_generator()
		],
		ids=[_id for _id, _, _, _ in self._vectors_generator()]
		ids=[_id for _id, _, _, _, _ in self._vectors_generator()]
		)


		@@ -265,13 +272,13 @@ class Embedder(Consts):
		return deleted_collections


		def search_similar(self, collection_name, *input_text, n_results=3) -> list[str]:
		def search_similar(self, collection_name, *input_text, n_results=3) -> tuple[Any, list[Any]]:
		"""
		Searches specified collection for similar text chunks according to given input_text.
		:param n_results: How many results to return.
		:param collection_name: The collection to search to.
		:param input_text: The text chunk/s to search for similar chunks.
		:return: list of results.
		:return: tuple of results where the first index is the text result and the second index is the doc_path of each result.
		"""

		query_text = list(input_text)
		@@ -281,10 +288,14 @@ class Embedder(Consts):
		# Get collection or raise error if it doesn't exist.
		collection = self._get_collection_error(collection_name)

		return collection.query(
		# Get results
		results = collection.query(
		query_texts=query_text,
		n_results=n_results
		)["documents"][0]
		n_results=n_results,
		include=["documents", "metadatas"]
		)

		return results["documents"][0], [item["doc_path"] for item in results["metadatas"][0]]

		def count(self, collection_name) -> int:
		"""
		@@ -397,16 +408,19 @@ class Embedder(Consts):
		return True if self._get_collection(collection_name) else False


		embedder = Embedder()
		# embedder = Embedder()
		# embedder.load_docs(directory="aiani dedomena/*", chunking_type=Embedder.ByChar)
		# embedder.delete_collections("all")
		#
		# # print(embedder.get_chunks())
		#
		#
		# embedder.delete_collections("all")
		# #
		# # # print(embedder.get_chunks())
		# embedder.add_data("Mycollection")

		print(embedder.search_similar("Mycollection", "Τι είναι η δεξαμενή?", n_results=3))

		# embedder.visualize("Mycollection", dimensions=["2d", "3d"])
		#
		# print(embedder.search_similar("Mycollection", "Τι είναι η δεξαμενή?", n_results=3))
		#
		# # embedder.visualize("Mycollection", dimensions=["2d", "3d"])

generator.py

+30 −3

Original line number	Diff line number	Diff line
		@@ -21,7 +21,10 @@ class Generator:
		self._system_prompt = ("Είσαι ένας ξεναγός του αρχαιολογικού μουσείου Αιανής που βρισκεται στην Κοζάνη (μια μικρή πόλη στην Ελλάδα). "
		"Στόχος σου είναι να απαντάς στις ερωτήσεις που κάνουν οι επισκέπτες. "
		"Για κάθε ερώτηση θα σου παρέχεται σχετικά κομμάτια κειμένου τα οποία μπορείς να συμβουλευτείς για να απαντήσεις στην ερώτηση του χρήστη."
		"Στην περίπτωση που δεν γνωρίζεις την απάντηση στην ερώτηση που έθεσε ο χρήστης πες με ευγενικό τρόπο πως δεν γνωρίζεις την απάντηση και μήπως θέλει να ρωτήσει κάτι άλλο.")
		"Στην περίπτωση που δεν γνωρίζεις την απάντηση στην ερώτηση που έθεσε ο χρήστης πες με ευγενικό τρόπο πως δεν γνωρίζεις την απάντηση και μήπως θέλει να ρωτήσει κάτι άλλο."
		"Σε κάθε κομμάτι κειμένου που σου παρέχεται θα υπάρχει και η πηγή απο την οποία προήλθε και θα αναγράφεται στο τέλος του μετά την λέξη κλειδί «Πηγή:», "
		"αν χρησιμοποιήσεις κάποια απο τα κομμάτια αυτά στο τέλος της απάντησης σου παρέθεσε της πηγές απο τα κομμάτια κειμένου που χρησιμοποίησες γράφοντας «Πηγές: (αναφορά των πηγών σε bullets)»"
		"Μην βάζεις δικές σου πηγές αλλά μόνο αυτές που αναφέρονται σε κάθε κομμάτι κειμένου μετά την λέξη κλειδή «Πηγή:»")
		self._model = "gpt-4o-mini"
		self._conversation = [
		{"role": "system", "content": self._system_prompt}
		@@ -43,8 +46,12 @@ class Generator:
		prompt = ""
		prompt += f"{question}\n\n"
		prompt += "Παρακαλώ συμβουλεύσου τα παρακάτω σχετικά με την ερώτηση κείμενα πριν απαντήσεις: \n\n"
		for chunk in self._embedder.search_similar(self._collection_name, question, n_results=self._n_results):
		prompt += chunk + "\n\n"

		similars = self._embedder.search_similar(self._collection_name, question, n_results=self._n_results)
		texts = similars[0]
		sources = similars[1]
		for chunk, source in tuple(zip(texts, sources)):
		prompt += chunk + " Πηγή: "+ source + "\n\n"

		print(prompt) # TODO Delete this line
		return prompt
		@@ -108,6 +115,23 @@ class Generator:
		"""
		return True if not self._conversation else False

		def _make_sources_links(self, text: str) -> str \| None:
		"""
		Makes the sources links that open in a new tab
		:param text: The text to make the sources links for
		"""
		if "Πηγές" in text:
		start_index = text.index("Πηγές:")
		sources_txt = text[start_index:]
		sources_list = sources_txt.split("\n")[1:]
		for source in sources_list:
		n_source = source.strip('- ').replace("\\", "/")
		# text = text.replace(source, f"<a href='file///{n_source}'>{n_source}</a>")
		text = text.replace(source, "<a href='file:///C:/Users/στεργιος/PycharmProjects/Diplomatiki/aiani dedomena/megaloi_domoi.pdf'>link</a>")
		print("sources: ", text)
		return text


		## ====== CALLABLE METHODS ====== ##
		def generate_answer(self, question, model):
		"""
		@@ -132,6 +156,9 @@ class Generator:
		for chunk in answering_fn():
		yield chunk
		answer += chunk
		# sources_answer = self._make_sources_links(answer)
		# if sources_answer:
		# yield sources_answer

		# Save only the user's question
		self._conversation[-1]["content"] = question

mydoc.py

+13 −1

Original line number	Diff line number	Diff line
		@@ -6,7 +6,10 @@ class MyDoc(Consts):

		# CONSTANTS
		EXCLUDE = [". \xa0 ", ". ","1 / 3 ΣΤΩΪΚΟ ΚΤΗΡΙΟ \xa0 ", "\xa0", "\xa0\xa0\xa0",
		"\xa0\xa0 \xa0\xa0\xa0\xa0\xa0\xa0", "2 / 3 ΣΤΩΪΚΟ ΚΤΗΡΙΟ", "\xa0 \xa0 \xa0", "3 / 3"]
		"\xa0\xa0 \xa0\xa0\xa0\xa0\xa0\xa0", "2 / 3 ΣΤΩΪΚΟ ΚΤΗΡΙΟ", "\xa0 \xa0 \xa0", "3 / 3", "1 / 1", "1 / 2",
		"2 / 2", "1 / 4", "2 / 4", "3 / 4", "4 / 4", "Δρ Γεωργία Καραμήτρου Μεντεσίδη", "Φωτογραφίες", "1 / 5",
		"2 / 5", "3 / 5", "4 / 5", "5 / 5", "1 / 7", "2 / 7", "3 / 7", "4 / 5", "5 / 7", "6 / 7", "7 / 7", "1 / 3",
		"2 / 3", "3 / 3"]

		def __init__(self, filepath: str, chunking_type=None, color=None):
		"""
		@@ -106,6 +109,7 @@ class MyDoc(Consts):
		Exception("You need to set the chunking type!!!")
		self._clear_chunks()
		self._add_title_to_chunks()
		self._add_doc_path()

		def get_chunks(self) -> list:
		"""
		@@ -152,6 +156,14 @@ class MyDoc(Consts):
		else:
		raise Exception("You need to chunk the document first!")

		def _add_doc_path(self):
		"""Adds the doc path to the chunk metadata"""
		if self._chunks:
		for chunk in self._chunks:
		chunk.metadata["doc_path"] = self._loader.file_path
		else:
		Exception("You need to chunk the document first!")

		def __repr__(self):
		return f"Title: {self.get_title()}\n\nText: {self.get_text()}\n\n"

uicontroller.py

+7 −4

Original line number	Diff line number	Diff line
		@@ -4,8 +4,8 @@ from embedder import Embedder

		class UIController:

		def __init__(self):
		self.gen = Generator(Embedder(), "Mycollection", n_results=5)
		def __init__(self, n_results=5):
		self.gen = Generator(Embedder(), "Mycollection", n_results=n_results)
		self.embedder = Embedder()

		self._prepare_embedder()
		@@ -18,6 +18,7 @@ class UIController:
		"""

		if not self.embedder.collection_exists("Mycollection"):
		print("Loading documents embeddings...")
		self.embedder.load_docs(directory="aiani dedomena/*", chunking_type=Embedder.ByChar)
		self.embedder.add_data("Mycollection")

		@@ -70,5 +71,7 @@ class UIController:
		demo.launch(share=share)


		ui = UIController()
		ui.create_ui(share=True)
		No newline at end of file
		ui = UIController(n_results=20)
		# ui.embedder.delete_collections("all")
		# ui.embedder.visualize(collection_name="Mycollection", dimensions=["2d", "3d"])
		ui.create_ui(share=False)