Commit 5dc21511 authored by Stergios Papadopoulos's avatar Stergios Papadopoulos
Browse files

- Added vectorize method, but I need to find another vector DB

- Added _get_embedding_gpt method
- Added _vectors_generator method
parent 27d54137
Loading
Loading
Loading
Loading
+2 −0
Original line number Diff line number Diff line
/include/
/Lib/
/Scripts/
.env
+72 −5
Original line number Diff line number Diff line
import glob
from mydoc import MyDoc
from constantscls import Consts
from openai import OpenAI
from dotenv import load_dotenv
import os
from pymilvus import MilvusClient


class Embedder(Consts):

    GPTembed = "text-embedding-3-small"
    def __init__(self):
        """
        Prepares the embedder: loads the environment, validates the OpenAI API key
        and creates the empty containers plus the OpenAI client.
        :raises RuntimeError: If OPENAI_API_KEY is missing or empty after loading the .env file.
        """

        # Load env variables
        load_dotenv()
        api_key = os.getenv("OPENAI_API_KEY")
        if not api_key:
            # Writing None into os.environ would fail with an unhelpful
            # "str expected, not NoneType" TypeError, so report the real cause.
            raise RuntimeError(
                "OPENAI_API_KEY is not set. Define it in the environment or in the .env file."
            )
        os.environ["OPENAI_API_KEY"] = api_key

        # Initializations
        self._docs: list[MyDoc] = []
        self._chunks = []
        self._vectorDBs = []
        self.client = OpenAI()

    ## PRIVATE METHODS
    def _get_embedding_gpt(self, text, dimensions=1024):
        """
        Embeds a single text with the model named by Embedder.GPTembed.
        :param text: The text to embed.
        :param dimensions: The dimensions requested for the resulting vector.
        :return: The embedding vector of the given text.
        """
        response = self.client.embeddings.create(
            input=[text],
            model=Embedder.GPTembed,
            dimensions=dimensions,
        )
        # A single text was sent, so its result is the first entry of the response data.
        first_result = response.data[0]
        return first_result.embedding

    def _vectors_generator(self, dimensions=1024):
        """
        A generator that yields the vector representation of each loaded chunk.
        :param dimensions: The dimensions of each generated vector.
        :return: Yields one (_id, vector, title, text, color) tuple per chunk.
        """

        for chunk in self.get_chunks():
            vector = self._get_embedding_gpt(chunk.page_content, dimensions)
            metadata = chunk.metadata
            _id = metadata["_id"]
            title = metadata["title"]
            # A "page_content" metadata entry is still honored when present; otherwise use
            # the chunk's own text (the text that was embedded) instead of raising KeyError.
            text = metadata.get("page_content", chunk.page_content)
            color = metadata.get("color", "black")  # TODO put the hexadecimal value if needed
            yield _id, vector, title, text, color

    ## CALLABLE METHODS ##
    def chunk_docs(self, chunking_type=None, color=None):
@@ -68,8 +102,41 @@ class Embedder(Consts):
        else:
            raise Exception("You need to chunk the loaded documents first!!! Use chunk_docs() method")

    def vectorize(self, vectordb_name=None, collection_name="collection", dimensions=1024):
        """
        Creates the vector DB named as specified and stores the vectors of the loaded chunks in it.
        NOTE: Special handling of a name that matches an existing vector DB is not implemented yet.
        :param vectordb_name: The name of the vector database. It must end with '.db'.
        :param collection_name: The name of the collection in the vector db.
        :param dimensions: The dimensions that the vectors will be represented.
        :return: The path of the vector DB created (the given vectordb_name).
        :raises TypeError: If vectordb_name is not a string (e.g. it was left as None).
        :raises Exception: If vectordb_name does not end with '.db'.
        """

        if not isinstance(vectordb_name, str):
            raise TypeError("vectordb_name must be a string ending with '.db'")
        # Check the suffix, not mere containment, so names like 'my.dbx' are rejected too.
        if not vectordb_name.endswith(".db"):
            raise Exception("Please include the '.db' at the end of the vector database name!!!")

        ## TODO CREATE THE CODE FOR SAME VECTOR DB NAMES

        client = MilvusClient(vectordb_name)  # Create the vectorDB named as...
        client.create_collection(
            collection_name=collection_name,
            dimension=dimensions
        )

        # Prepare the data for saving in the vector DB
        data = [
            {
                "id": _id,
                "vector": vector,
                "title": title,
                "text": text,
                "color": color,
                "subject": title
            }
            for _id, vector, title, text, color in self._vectors_generator(dimensions)
        ]
        client.insert(collection_name=collection_name, data=data)

        return vectordb_name

    def add_to_vectordb(self):
        """
        Placeholder without any behavior yet: calling it does nothing and returns None.
        """
        return None
@@ -92,7 +159,7 @@ class Embedder(Consts):


# Ad-hoc driver that runs whenever this module is executed or imported:
# it builds an Embedder, loads documents, prints them and vectorizes the chunks
# into the "vectordb.db" database file.
# NOTE(review): load_docs() is called twice (first without a directory, then for a
# single PDF) - this looks like old and new lines of a diff shown together; confirm
# which calls are meant to stay.
embedder = Embedder()
embedder.load_docs(chunking_type=Embedder.ByChar)
print(f"Documents:\n\n{embedder.get_docs()}")
print(f"chunks:\n\n{embedder.get_chunks()}")
embedder.load_docs(chunking_type=Embedder.ByChar, directory="aiani dedomena/2009-04-22-14-52-16.pdf")
embedder.vectorize("vectordb.db")