-Added vectorize method but i need to find another vectorDB (5dc21511) · Commits · MIRANET / Diploma Projects / AI Tour Guide

.gitignore

+2 −0

Original line number	Diff line number	Diff line
		/include/
		/Lib/
		/Scripts/
		.env

embedder.py

+72 −5

Original line number	Diff line number	Diff line
		import glob
		from mydoc import MyDoc
		from constantscls import Consts
		from openai import OpenAI
		from dotenv import load_dotenv
		import os
		from pymilvus import MilvusClient


		class Embedder(Consts):

		# Name of the OpenAI embedding model; passed as `model` to `embeddings.create` in `_get_embedding_gpt`.
		GPTembed = "text-embedding-3-small"
		def __init__(self):
		    """
		    Set up an empty embedder backed by an OpenAI client.

		    Loads variables from a ``.env`` file, checks that ``OPENAI_API_KEY`` is
		    available and initialises the empty containers for documents, chunks
		    and vector databases.
		    :raises RuntimeError: If ``OPENAI_API_KEY`` is not set (neither in the environment nor in ``.env``).
		    """

		    # Load env variables
		    load_dotenv()
		    api_key = os.getenv("OPENAI_API_KEY")
		    if not api_key:
		        # os.environ only accepts strings, so writing a missing key back into it
		        # would die with an opaque TypeError; fail with an actionable message instead.
		        raise RuntimeError(
		            "OPENAI_API_KEY is not set. Add it to the environment or to the .env file."
		        )

		    # Initializations
		    self._docs: list[MyDoc] = []
		    self._chunks = []
		    self._vectorDBs = []
		    # Hand the key over explicitly instead of relying on the client's env lookup.
		    self.client = OpenAI(api_key=api_key)

		## PRIVATE METHODS
		def _get_embedding_gpt(self, text, dimensions=1024):
		    """
		    Embed a single piece of text with the model named in ``Embedder.GPTembed``.
		    :param text: The text to embed.
		    :param dimensions: The number of dimensions requested for the vector.
		    :return: The embedding of the given text.
		    """
		    # The request takes a list of inputs; send a one-element list and unwrap the single result.
		    response = self.client.embeddings.create(
		        input=[text],
		        model=Embedder.GPTembed,
		        dimensions=dimensions,
		    )
		    first_result = response.data[0]
		    return first_result.embedding

		def _vectors_generator(self, dimensions=1024):
		    """
		    A generator that yields vector representations of the loaded chunks.
		    :param dimensions: The dimensions of each generated vector.
		    :return: Yields one ``(_id, vector, title, text, color)`` tuple per chunk.
		    :raises KeyError: If a chunk's metadata has no ``_id`` or ``title`` entry.
		    """

		    for chunk in self.get_chunks():
		        metadata = chunk.metadata
		        # Read the required metadata before the (remote) embedding call so that
		        # a malformed chunk fails without a wasted request.
		        _id = metadata["_id"]
		        title = metadata["title"]
		        # The vector is built from chunk.page_content, so fall back to that exact
		        # text when the metadata does not carry its own "page_content" copy.
		        text = metadata.get("page_content", chunk.page_content)
		        color = metadata.get("color", "black") #TODO put the hexadecimal value if needed
		        vector = self._get_embedding_gpt(chunk.page_content, dimensions)
		        yield _id, vector, title, text, color

		## CALLABLE METHODS ##
		def chunk_docs(self, chunking_type=None, color=None):
		@@ -68,8 +102,41 @@ class Embedder(Consts):
		else:
		raise Exception("You need to chunk the loaded documents first!!! Use chunk_docs() method")

		def vectorize(self, vectordb_name=None, collection_name="collection", dimensions=1024):
		    """
		    Embeds the loaded chunks and stores them in a collection of the given vector DB.

		    Handling of a name that clashes with an existing vector DB (auto increasing
		    number) is not implemented yet.
		    :param vectordb_name: The name of the vector database. It must end with '.db'.
		    :param collection_name: The name of the collection in the vector db.
		    :param dimensions: The dimensions that the vectors will be represented.
		    :return: The path of the vector DB created
		    :raises ValueError: If vectordb_name is missing, not a string, or does not end with '.db'.
		    """

		    # Validate up front: the default (None) and any non-string value are rejected here
		    # instead of crashing on the membership test, and the suffix really has to be at
		    # the end of the name, as the message says.
		    if not isinstance(vectordb_name, str) or not vectordb_name.endswith(".db"):
		        raise ValueError("Please include the '.db' at the end of the vector database name!!!")

		    ## TODO CREATE THE CODE FOR SAME VECTOR DB NAMES

		    client = MilvusClient(vectordb_name)  # Create the vectorDB named as...
		    try:
		        client.create_collection(
		            collection_name=collection_name,
		            dimension=dimensions,
		        )

		        # Prepare the data for saving in the vector DB
		        data = [
		            {
		                "id": _id,
		                "vector": vector,
		                "title": title,
		                "text": text,
		                "color": color,
		                "subject": title,
		            }
		            for _id, vector, title, text, color in self._vectors_generator(dimensions)
		        ]
		        client.insert(collection_name=collection_name, data=data)
		    finally:
		        # Release the connection to the database even when embedding or inserting fails.
		        client.close()

		    return vectordb_name

		def add_to_vectordb(self):
		    """
		    Placeholder: no behavior is implemented yet.
		    :return: None
		    """
		    return None
		@@ -92,7 +159,7 @@ class Embedder(Consts):


		# Ad-hoc driver executed at module level: build an Embedder, load documents,
		# print the loaded documents and chunks, then vectorize them.
		embedder = Embedder()
		# NOTE(review): load_docs is called twice (here without a directory, below with one);
		# presumably these are the pre- and post-change lines of the diff — confirm which one is live.
		embedder.load_docs(chunking_type=Embedder.ByChar)
		print(f"Documents:\n\n{embedder.get_docs()}")
		print(f"chunks:\n\n{embedder.get_chunks()}")
		embedder.load_docs(chunking_type=Embedder.ByChar, directory="aiani dedomena/2009-04-22-14-52-16.pdf")
		# Only the database name is passed, so vectorize() uses its default collection name and dimensions.
		embedder.vectorize("vectordb.db")