# --- .gitignore (+2 −0) -----------------------------------------------------
# /include/
# /Lib/
# /Scripts/
# .env
#
# --- embedder.py (+72 −5) ---------------------------------------------------
# Reconstructed from a collapsed diff view: the original whitespace and the
# +/- markers were lost, and some regions were never expanded by the viewer.
# Those regions are kept below as comments instead of being guessed at.
import glob
from mydoc import MyDoc
from constantscls import Consts
from openai import OpenAI
from dotenv import load_dotenv
import os
from pymilvus import MilvusClient


class Embedder(Consts):
    """Embeds document chunks with an OpenAI model and stores them in a Milvus vector DB."""

    # Name of the OpenAI embedding model used by _get_embedding_gpt().
    GPTembed = "text-embedding-3-small"

    def __init__(self):
        """
        Loads the environment variables and creates the OpenAI client.

        :raises Exception: If OPENAI_API_KEY is defined neither in the environment nor in the .env file
        """
        # Load env variables
        load_dotenv()
        # The original re-assigned os.environ["OPENAI_API_KEY"] to itself, which is
        # a no-op when the key exists and an opaque TypeError when it does not.
        # Fail early with an explicit message instead.
        if not os.getenv("OPENAI_API_KEY"):
            raise Exception("OPENAI_API_KEY is not set. Define it in the environment or in the .env file.")

        # Initializations
        self._docs: list[MyDoc] = []
        self._chunks = []
        self._vectorDBs = []
        self.client = OpenAI()

    ## PRIVATE METHODS
    def _get_embedding_gpt(self, text, dimensions=1024):
        """
        Creates the vector using the specified embedding model.

        :param text: The text to embed.
        :param dimensions: The dimensions of the vector
        :return: The vector representing the given text
        """
        response = self.client.embeddings.create(
            input=[text],
            model=Embedder.GPTembed,
            dimensions=dimensions,
        )
        # A single input was sent, so the only embedding is at index 0.
        return response.data[0].embedding

    def _vectors_generator(self, dimensions=1024):
        """
        A generator that yields the vector representation of each loaded chunk.

        :param dimensions: The dimensions of the vectors
        :return: Yields (_id, vector, title, text, color) tuples, one per chunk
        """
        for chunk in self.get_chunks():
            vector = self._get_embedding_gpt(chunk.page_content, dimensions)
            metadata = chunk.metadata
            _id = metadata["_id"]
            title = metadata["title"]
            # Prefer the copy stored in the metadata (original behaviour); fall back
            # to the text that was actually embedded so a chunk without that key no
            # longer raises KeyError.
            text = metadata.get("page_content", chunk.page_content)
            color = metadata.get("color", "black")  #TODO put the hexadecimal value if needed
            yield _id, vector, title, text, color

    ## CALLABLE METHODS ##

    # def chunk_docs(self, chunking_type=None, color=None):
    #     <body collapsed in the diff view - not visible here>
    #
    # @@ -68,8 +102,41 @@ class Embedder(Consts):
    # <tail of a method whose beginning is not visible here>
    #         else:
    #             raise Exception("You need to chunk the loaded documents first!!! Use chunk_docs() method")
    #
    # The diff also shows the previous stub `def vectorize(self): pass`. It is
    # dropped because the definition below has the same name and would shadow
    # it in any case.

    def vectorize(self, vectordb_name=None, collection_name="collection", dimensions=1024):
        """
        Creates the vector DB named as specified and fills it with the embedded chunks.

        Handling of a name that collides with an existing vector DB (auto increasing
        number) is not implemented yet, see the TODO below.

        :param vectordb_name: The name of the vector database. Make sure to include .db in the end
        :param collection_name: The name of the collection in the vector db.
        :param dimensions: The dimensions that the vectors will be represented.
        :return: The path of the vector DB created
        :raises Exception: If vectordb_name is missing or does not end with '.db'
        """
        # The default is None, so guard against it before doing string checks, and
        # test the suffix since the name has to END with '.db'.
        if not vectordb_name or not vectordb_name.endswith(".db"):
            raise Exception("Please include the '.db' at the end of the vector database name!!!")

        ## TODO CREATE THE CODE FOR SAME VECTOR DB NAMES
        client = MilvusClient(vectordb_name)  # Create the vectorDB named as...
        # NOTE(review): only `dimension` is given, so the collection schema comes from
        # pymilvus defaults - presumably an integer "id" primary key plus a "vector"
        # field with extra keys stored dynamically. Confirm that `_id` values are ints.
        client.create_collection(
            collection_name=collection_name,
            dimension=dimensions,
        )

        # Prepare the data for saving in the vector DB
        data = [
            {
                "id": _id,
                "vector": vector,
                "title": title,
                "text": text,
                "color": color,
                "subject": title,
            }
            for _id, vector, title, text, color in self._vectors_generator(dimensions)
        ]
        client.insert(collection_name=collection_name, data=data)
        return vectordb_name

    def add_to_vectordb(self):
        """Placeholder: adding to an existing vector DB is not implemented yet."""
        pass

    # <region collapsed in the diff view - not visible here>
    # @@ -92,7 +159,7 @@ class Embedder(Consts):


# Run the demo only when executed as a script, so importing this module does not
# trigger document loading, embedding requests and DB writes.
if __name__ == "__main__":
    embedder = Embedder()
    # NOTE(review): the diff view lost its +/- markers. The next three statements
    # are probably the removed (pre-change) lines and the last two the added ones -
    # confirm against the repository before relying on this driver.
    embedder.load_docs(chunking_type=Embedder.ByChar)
    print(f"Documents:\n\n{embedder.get_docs()}")
    print(f"chunks:\n\n{embedder.get_chunks()}")
    embedder.load_docs(chunking_type=Embedder.ByChar, directory="aiani dedomena/2009-04-22-14-52-16.pdf")
    embedder.vectorize("vectordb.db")
# --- embedder.py (+72 −5) ---------------------------------------------------
# Reconstructed from a collapsed diff view: the original whitespace and the
# +/- markers were lost, and some regions were never expanded by the viewer.
# Those regions are kept below as comments instead of being guessed at.
import glob
from mydoc import MyDoc
from constantscls import Consts
from openai import OpenAI
from dotenv import load_dotenv
import os
from pymilvus import MilvusClient


class Embedder(Consts):
    """Embeds document chunks with an OpenAI model and stores them in a Milvus vector DB."""

    # Name of the OpenAI embedding model used by _get_embedding_gpt().
    GPTembed = "text-embedding-3-small"

    def __init__(self):
        """
        Loads the environment variables and creates the OpenAI client.

        :raises Exception: If OPENAI_API_KEY is defined neither in the environment nor in the .env file
        """
        # Load env variables
        load_dotenv()
        # The original re-assigned os.environ["OPENAI_API_KEY"] to itself, which is
        # a no-op when the key exists and an opaque TypeError when it does not.
        # Fail early with an explicit message instead.
        if not os.getenv("OPENAI_API_KEY"):
            raise Exception("OPENAI_API_KEY is not set. Define it in the environment or in the .env file.")

        # Initializations
        self._docs: list[MyDoc] = []
        self._chunks = []
        self._vectorDBs = []
        self.client = OpenAI()

    ## PRIVATE METHODS
    def _get_embedding_gpt(self, text, dimensions=1024):
        """
        Creates the vector using the specified embedding model.

        :param text: The text to embed.
        :param dimensions: The dimensions of the vector
        :return: The vector representing the given text
        """
        response = self.client.embeddings.create(
            input=[text],
            model=Embedder.GPTembed,
            dimensions=dimensions,
        )
        # A single input was sent, so the only embedding is at index 0.
        return response.data[0].embedding

    def _vectors_generator(self, dimensions=1024):
        """
        A generator that yields the vector representation of each loaded chunk.

        :param dimensions: The dimensions of the vectors
        :return: Yields (_id, vector, title, text, color) tuples, one per chunk
        """
        for chunk in self.get_chunks():
            vector = self._get_embedding_gpt(chunk.page_content, dimensions)
            metadata = chunk.metadata
            _id = metadata["_id"]
            title = metadata["title"]
            # Prefer the copy stored in the metadata (original behaviour); fall back
            # to the text that was actually embedded so a chunk without that key no
            # longer raises KeyError.
            text = metadata.get("page_content", chunk.page_content)
            color = metadata.get("color", "black")  #TODO put the hexadecimal value if needed
            yield _id, vector, title, text, color

    ## CALLABLE METHODS ##

    # def chunk_docs(self, chunking_type=None, color=None):
    #     <body collapsed in the diff view - not visible here>
    #
    # @@ -68,8 +102,41 @@ class Embedder(Consts):
    # <tail of a method whose beginning is not visible here>
    #         else:
    #             raise Exception("You need to chunk the loaded documents first!!! Use chunk_docs() method")
    #
    # The diff also shows the previous stub `def vectorize(self): pass`. It is
    # dropped because the definition below has the same name and would shadow
    # it in any case.

    def vectorize(self, vectordb_name=None, collection_name="collection", dimensions=1024):
        """
        Creates the vector DB named as specified and fills it with the embedded chunks.

        Handling of a name that collides with an existing vector DB (auto increasing
        number) is not implemented yet, see the TODO below.

        :param vectordb_name: The name of the vector database. Make sure to include .db in the end
        :param collection_name: The name of the collection in the vector db.
        :param dimensions: The dimensions that the vectors will be represented.
        :return: The path of the vector DB created
        :raises Exception: If vectordb_name is missing or does not end with '.db'
        """
        # The default is None, so guard against it before doing string checks, and
        # test the suffix since the name has to END with '.db'.
        if not vectordb_name or not vectordb_name.endswith(".db"):
            raise Exception("Please include the '.db' at the end of the vector database name!!!")

        ## TODO CREATE THE CODE FOR SAME VECTOR DB NAMES
        client = MilvusClient(vectordb_name)  # Create the vectorDB named as...
        # NOTE(review): only `dimension` is given, so the collection schema comes from
        # pymilvus defaults - presumably an integer "id" primary key plus a "vector"
        # field with extra keys stored dynamically. Confirm that `_id` values are ints.
        client.create_collection(
            collection_name=collection_name,
            dimension=dimensions,
        )

        # Prepare the data for saving in the vector DB
        data = [
            {
                "id": _id,
                "vector": vector,
                "title": title,
                "text": text,
                "color": color,
                "subject": title,
            }
            for _id, vector, title, text, color in self._vectors_generator(dimensions)
        ]
        client.insert(collection_name=collection_name, data=data)
        return vectordb_name

    def add_to_vectordb(self):
        """Placeholder: adding to an existing vector DB is not implemented yet."""
        pass

    # <region collapsed in the diff view - not visible here>
    # @@ -92,7 +159,7 @@ class Embedder(Consts):


# Run the demo only when executed as a script, so importing this module does not
# trigger document loading, embedding requests and DB writes.
if __name__ == "__main__":
    embedder = Embedder()
    # NOTE(review): the diff view lost its +/- markers. The next three statements
    # are probably the removed (pre-change) lines and the last two the added ones -
    # confirm against the repository before relying on this driver.
    embedder.load_docs(chunking_type=Embedder.ByChar)
    print(f"Documents:\n\n{embedder.get_docs()}")
    print(f"chunks:\n\n{embedder.get_chunks()}")
    embedder.load_docs(chunking_type=Embedder.ByChar, directory="aiani dedomena/2009-04-22-14-52-16.pdf")
    embedder.vectorize("vectordb.db")