Loading embedder.py +108 −6 Original line number Diff line number Diff line import glob import random from typing import Any from mydoc import MyDoc Loading @@ -8,6 +9,9 @@ from dotenv import load_dotenv import os import chromadb import chromadb.utils.embedding_functions as embedding_functions import numpy as np from sklearn.manifold import TSNE import plotly.graph_objects as plt class Embedder(Consts): Loading Loading @@ -122,6 +126,23 @@ class Embedder(Consts): raise Exception("Collection does not exists!!!") return col def _random_colors_generator(self, num_colors) -> list[str]: """ Generates a list of random colors. :param num_colors: The number of colors to generate. :return: List of colors in hexadecimal format. """ colors = [] used_colors = set() for _ in range(num_colors): while True: hex_color = '#' + ''.join([random.choice('0123456789abcdef') for _ in range(6)]) if hex_color not in used_colors: used_colors.add(hex_color) colors.append(hex_color) break return colors ## CALLABLE METHODS ## def chunk_docs(self, chunking_type=None, color=None): """ Loading Loading @@ -162,16 +183,18 @@ class Embedder(Consts): raise Exception("You provided more colors than documents.") elif colors_len < doc_paths_len: raise Exception("You provided less colors than documents length.") else: colors = self._random_colors_generator(len(doc_paths)) print(f"Colors generated: {colors}") # Load documents for i in range(len(doc_paths)): color = colors[i] if colors else None color = colors[i] doc_path = doc_paths[i] self._docs.append(MyDoc(doc_path, chunking_type=chunking_type, color=color)) # Save docs in a list # Load Chunks if specified if chunking_type: self.chunk_docs(chunking_type=chunking_type) self.chunk_docs(chunking_type=chunking_type, color=color) def get_docs(self) -> list[MyDoc]: """ Loading Loading @@ -290,15 +313,94 @@ class Embedder(Consts): """ return True if self._docs else False def visualize(self, collection_name: str, dimensions=None) -> None: """ Creates a plot with matplotlib of the embeddings. :param collection_name: The name of the collection to visualize. :param dimensions: List representing the dimensions to use for the plot. Default is ['2d']. If ['2d', '3d'] is given there will be two plots. :return: None """ # Set default dimensions for visualization to 2d if dimensions is None: dimensions = ["2d"] # Extract metadata and embeddings from specified collection collection = self._get_collection_error(collection_name) results = collection.get(include=["embeddings", "metadatas"]) ids = results["ids"] embeddings = results["embeddings"] titles = [] colors = [] for i in range(len(results["metadatas"])): titles.append(results["metadatas"][i]["title"]) colors.append(results["metadatas"][i]["color"]) print(f"ids: {ids}\nembeddings: {embeddings}\ntitles: {titles}\ncolors: {colors}") # Reduce dimensionality with tsne package original_embeddings = np.array(embeddings) # For 2d if "2d" in dimensions: tsne = TSNE(n_components=2, random_state=42) reduced_vectors = tsne.fit_transform(embeddings) print(reduced_vectors) fig = plt.Figure(data=[plt.Scatter( x=reduced_vectors[:, 0], y=reduced_vectors[:, 1], mode='markers', marker=dict(size=5, color=colors, opacity=0.8), text=ids, hoverinfo='text' )]) fig.update_layout( title='2D Chroma Vector Store Visualization', scene=dict(xaxis_title='x', yaxis_title='y'), width=800, height=600, margin=dict(r=20, b=10, l=10, t=40) ) fig.show() # For 3d if "3d" in dimensions: tsne = TSNE(n_components=3, random_state=42) reduced_vectors = tsne.fit_transform(embeddings) print(reduced_vectors) fig = plt.Figure(data=[plt.Scatter3d( x=reduced_vectors[:, 0], y=reduced_vectors[:, 1], z=reduced_vectors[:, 2], mode='markers', marker=dict(size=5, color=colors, opacity=0.8), text=ids, hoverinfo='text' )]) fig.update_layout( title='3D Chroma Vector Store Visualization', scene=dict(xaxis_title='x', yaxis_title='y'), width=800, height=600, margin=dict(r=20, b=10, l=10, t=40) ) fig.show() embedder = Embedder() # embedder.load_docs(directory="aiani dedomena/*", chunking_type=Embedder.ByChar) # embedder.delete_collections("all") # print(embedder.get_chunks()) # # # print(embedder.get_chunks()) # embedder.add_data("Mycollection") print(embedder.search_similar("Mycollection", "Τι είναι η δεξαμενή?", n_results=3)) # print(embedder.search_similar("Mycollection", "Τι είναι η δεξαμενή?", n_results=3)) embedder.visualize("Mycollection", dimensions=["2d", "3d"]) mydoc.py +1 −1 Original line number Diff line number Diff line Loading @@ -93,7 +93,7 @@ class MyDoc(Consts): def chunk_document(self, chunking_type=Consts.ByChar, color=None) -> None: """ Chunks document depending the chunking type specified Chunks document depending on the chunking type specified :param chunking_type: The chunking method of the document's text. :param color: The hexadecimal code of the color metadata of the chunks. :return: None Loading Loading
embedder.py +108 −6 Original line number Diff line number Diff line import glob import random from typing import Any from mydoc import MyDoc Loading @@ -8,6 +9,9 @@ from dotenv import load_dotenv import os import chromadb import chromadb.utils.embedding_functions as embedding_functions import numpy as np from sklearn.manifold import TSNE import plotly.graph_objects as plt class Embedder(Consts): Loading Loading @@ -122,6 +126,23 @@ class Embedder(Consts): raise Exception("Collection does not exists!!!") return col def _random_colors_generator(self, num_colors) -> list[str]: """ Generates a list of random colors. :param num_colors: The number of colors to generate. :return: List of colors in hexadecimal format. """ colors = [] used_colors = set() for _ in range(num_colors): while True: hex_color = '#' + ''.join([random.choice('0123456789abcdef') for _ in range(6)]) if hex_color not in used_colors: used_colors.add(hex_color) colors.append(hex_color) break return colors ## CALLABLE METHODS ## def chunk_docs(self, chunking_type=None, color=None): """ Loading Loading @@ -162,16 +183,18 @@ class Embedder(Consts): raise Exception("You provided more colors than documents.") elif colors_len < doc_paths_len: raise Exception("You provided less colors than documents length.") else: colors = self._random_colors_generator(len(doc_paths)) print(f"Colors generated: {colors}") # Load documents for i in range(len(doc_paths)): color = colors[i] if colors else None color = colors[i] doc_path = doc_paths[i] self._docs.append(MyDoc(doc_path, chunking_type=chunking_type, color=color)) # Save docs in a list # Load Chunks if specified if chunking_type: self.chunk_docs(chunking_type=chunking_type) self.chunk_docs(chunking_type=chunking_type, color=color) def get_docs(self) -> list[MyDoc]: """ Loading Loading @@ -290,15 +313,94 @@ class Embedder(Consts): """ return True if self._docs else False def visualize(self, collection_name: str, dimensions=None) -> None: """ Creates a plot with matplotlib of the embeddings. :param collection_name: The name of the collection to visualize. :param dimensions: List representing the dimensions to use for the plot. Default is ['2d']. If ['2d', '3d'] is given there will be two plots. :return: None """ # Set default dimensions for visualization to 2d if dimensions is None: dimensions = ["2d"] # Extract metadata and embeddings from specified collection collection = self._get_collection_error(collection_name) results = collection.get(include=["embeddings", "metadatas"]) ids = results["ids"] embeddings = results["embeddings"] titles = [] colors = [] for i in range(len(results["metadatas"])): titles.append(results["metadatas"][i]["title"]) colors.append(results["metadatas"][i]["color"]) print(f"ids: {ids}\nembeddings: {embeddings}\ntitles: {titles}\ncolors: {colors}") # Reduce dimensionality with tsne package original_embeddings = np.array(embeddings) # For 2d if "2d" in dimensions: tsne = TSNE(n_components=2, random_state=42) reduced_vectors = tsne.fit_transform(embeddings) print(reduced_vectors) fig = plt.Figure(data=[plt.Scatter( x=reduced_vectors[:, 0], y=reduced_vectors[:, 1], mode='markers', marker=dict(size=5, color=colors, opacity=0.8), text=ids, hoverinfo='text' )]) fig.update_layout( title='2D Chroma Vector Store Visualization', scene=dict(xaxis_title='x', yaxis_title='y'), width=800, height=600, margin=dict(r=20, b=10, l=10, t=40) ) fig.show() # For 3d if "3d" in dimensions: tsne = TSNE(n_components=3, random_state=42) reduced_vectors = tsne.fit_transform(embeddings) print(reduced_vectors) fig = plt.Figure(data=[plt.Scatter3d( x=reduced_vectors[:, 0], y=reduced_vectors[:, 1], z=reduced_vectors[:, 2], mode='markers', marker=dict(size=5, color=colors, opacity=0.8), text=ids, hoverinfo='text' )]) fig.update_layout( title='3D Chroma Vector Store Visualization', scene=dict(xaxis_title='x', yaxis_title='y'), width=800, height=600, margin=dict(r=20, b=10, l=10, t=40) ) fig.show() embedder = Embedder() # embedder.load_docs(directory="aiani dedomena/*", chunking_type=Embedder.ByChar) # embedder.delete_collections("all") # print(embedder.get_chunks()) # # # print(embedder.get_chunks()) # embedder.add_data("Mycollection") print(embedder.search_similar("Mycollection", "Τι είναι η δεξαμενή?", n_results=3)) # print(embedder.search_similar("Mycollection", "Τι είναι η δεξαμενή?", n_results=3)) embedder.visualize("Mycollection", dimensions=["2d", "3d"])
mydoc.py +1 −1 Original line number Diff line number Diff line Loading @@ -93,7 +93,7 @@ class MyDoc(Consts): def chunk_document(self, chunking_type=Consts.ByChar, color=None) -> None: """ Chunks document depending the chunking type specified Chunks document depending on the chunking type specified :param chunking_type: The chunking method of the document's text. :param color: The hexadecimal code of the color metadata of the chunks. :return: None Loading