Commit d119a3cc authored by Stergios Papadopoulos
Browse files

- Added embedding_model as metadata to collections.

- Fixed _get_collection and _get_collection_error so that they attach the same embedding function that was used to embed the data.
- Added _add_title_to_chunks to the mydoc module so that better results can be retrieved.
parent ac24d108
Loading
Loading
Loading
Loading
+18 −9
Original line number Diff line number Diff line
@@ -91,7 +91,8 @@ class Embedder(Consts):
            name=collection_name,
            embedding_function=self._embedding_method,
            metadata={
                "hnsw:space": "cosine"
                "hnsw:space": "cosine",
                "embedding_model": embedding_model
            }
        )

@@ -102,10 +103,12 @@ class Embedder(Consts):
        :return: Collection if exists else None.
        """
        try:
            embedding_function=self._embedding_method
            return self.chroma_client.get_collection(name=collection_name, embedding_function=embedding_function)
            collection = self.chroma_client.get_collection(name=collection_name)
        except:
            return None
        else:
            self._set_embedding_func(model=collection.metadata["embedding_model"])
            return self.chroma_client.get_collection(name=collection_name, embedding_function=self._embedding_method)

    def _get_collection_error(self, collection_name):
        """
@@ -113,11 +116,11 @@ class Embedder(Consts):
        :param collection_name: collection name
        :return: Collection if exists.
        """
        try:
            embedding_function = self._embedding_method
            return self.chroma_client.get_collection(name=collection_name, embedding_function=embedding_function)
        except:

        col = self._get_collection(collection_name)
        if not col:
            raise Exception("Collection does not exists!!!")
        return col

    ## CALLABLE METHODS ##
    def chunk_docs(self, chunking_type=None, color=None):
@@ -258,7 +261,7 @@ class Embedder(Consts):

        return collection.query(
            query_texts=query_text,
            n_results=n_results,
            n_results=n_results
        )

    def count(self, collection_name) -> int:
@@ -289,7 +292,13 @@ class Embedder(Consts):


embedder = Embedder()
print(embedder.search_similar("Mycollection", "Τι είναι το σπίτι με τις σκάλες?"))
# embedder.load_docs(directory="aiani dedomena/*", chunking_type=Embedder.ByChar)
# embedder.delete_collections("all")

# print(embedder.get_chunks())
# embedder.add_data("Mycollection")

print(embedder.search_similar("Mycollection", "Τι είναι η δεξαμενή?", n_results=3))


+12 −1
Original line number Diff line number Diff line
@@ -42,7 +42,8 @@ class MyDoc(Consts):
            separators=["\n\xa0", "\n\n", "."]
        )

        chunks = self._text_splitter.split_text(self._text)
        chunks: list = self._text_splitter.split_text(self._text)

        self._chunks = self._text_splitter.create_documents(
            chunks,
            metadatas=[
@@ -104,6 +105,7 @@ class MyDoc(Consts):
        else:
            Exception("You need to set the chunking type!!!")
        self._clear_chunks()
        self._add_title_to_chunks()

    def get_chunks(self) -> list:
        """
@@ -141,6 +143,15 @@ class MyDoc(Consts):
        """
        return True if self._chunks else False

    def _add_title_to_chunks(self):
        """Adds the title to the chunks text content"""
        if self._chunks:
            for chunk in self._chunks:
                # Add the title in each chunk
                chunk.page_content = f"{self._title}: {chunk.page_content}"
        else:
            raise Exception("You need to chunk the document first!")

    def __repr__(self):
        return f"Title: {self.get_title()}\n\nText: {self.get_text()}\n\n"