Commit e89470d3 authored by Yeldar Toktasynov's avatar Yeldar Toktasynov

Rewrote faiss with GPU version

parent 2ccb1f1a
......@@ -2,12 +2,12 @@ FROM python:3.6
WORKDIR /app
COPY requirements.txt /app
COPY pip_requirements.txt /app
RUN pip install -r requirements.txt
RUN pip install -r pip_requirements.txt
COPY . /app
RUN python nltk_downloader.py
CMD ["python", "app.py"]
\ No newline at end of file
CMD ["python", "app.py"]
......@@ -36,6 +36,7 @@ df = pd.read_sql(session.query(Class_Table_Name.cluster_id, Class_Table_Name.cou
.filter(Class_Table_Name.is_actual == True, Class_Table_Name.language == 'ru')
.filter(or_(Class_Table_Name.count != 0, and_(Class_Table_Name.count==0,
Class_Table_Name.created_date > datetime.today() - timedelta(days = 30))))
.limit(1000)
.statement,session.bind)
......@@ -56,7 +57,7 @@ def clustering_post(text):
global DB_CENTROID_VECTORS
global DB_CLUSTER_COUNT
logging.info(msg=f"Select all modified TRUE data")
# logging.info(msg=f"Select all modified TRUE data")
clusters_query = session.query(Class_Table_Name.centroid, Class_Table_Name.cluster_id, Class_Table_Name.count).filter(Class_Table_Name.modified == True, Class_Table_Name.language == 'ru')
for row in clusters_query:
......@@ -66,7 +67,7 @@ def clustering_post(text):
single_np_array = single_np_array.astype('float32')
DB_CENTROID_VECTORS = np.append(DB_CENTROID_VECTORS, single_np_array.reshape(1, -1), axis = 0)
logging.info(msg=f"ДО РАЗМЕР центройда: {getsizeof(DB_CENTROID_VECTORS)} Длина: {len(DB_CENTROID_VECTORS)}")
# logging.info(msg=f"ДО РАЗМЕР центройда: {getsizeof(DB_CENTROID_VECTORS)} Длина: {len(DB_CENTROID_VECTORS)}")
clusters_length = len(DB_CENTROID_VECTORS)
DB_CENTROID_VECTORS = DB_CENTROID_VECTORS[0:clusters_length]
......@@ -76,48 +77,48 @@ def clustering_post(text):
logging.info(msg=f"Modified Cluster ID: {row.cluster_id}")
logging.info(msg=f"ПОСЛЕ РАЗМЕР центройда: {getsizeof(DB_CENTROID_VECTORS)} Длина: {len(DB_CENTROID_VECTORS)}")
# logging.info(msg=f"ПОСЛЕ РАЗМЕР центройда: {getsizeof(DB_CENTROID_VECTORS)} Длина: {len(DB_CENTROID_VECTORS)}")
logging.info(msg=f"Convert text into vector")
# logging.info(msg=f"Convert text into vector")
vector_post = text_preparation.prepare_data(text)
vector_post = vector_post.astype('float32')
logging.info(msg=f"Start find similarity")
# logging.info(msg=f"Start find similarity")
ids = Faiss_cluster.faiss_search_similarity(DB_CENTROID_VECTORS, vector_post)
logging.info(msg=f"End find similarity")
# logging.info(msg=f"End find similarity")
cluster_id = None
if ids != False:
cluster_id = DB_CENTROID_ID[ids]
logging.info(msg=f"Found Similar Cluster")
# logging.info(msg=f"Found Similar Cluster")
logging.info(msg=f"Start Compute Centroid")
# logging.info(msg=f"Start Compute Centroid")
new_similar_centroid = Faiss_cluster.compute_centroid(DB_CENTROID_VECTORS[ids], vector_post[0])
logging.info(msg=f"End Compute Centroid")
# logging.info(msg=f"End Compute Centroid")
new_similar_centroid = new_similar_centroid.astype(float)
DB_CLUSTER_COUNT[ids] = DB_CLUSTER_COUNT[ids] + 1
logging.info(msg=f"Start Update Similar Cluster DB")
# logging.info(msg=f"Start Update Similar Cluster DB")
session.query(Class_Table_Name).filter(Class_Table_Name.cluster_id == cluster_id).update({'centroid': new_similar_centroid, 'count': DB_CLUSTER_COUNT[ids]})
session.commit()
DB_CENTROID_VECTORS[ids] = new_similar_centroid
logging.info(msg=f"End Update Similar Cluster DB")
# logging.info(msg=f"End Update Similar Cluster DB")
elif ids == False:
logging.info(msg=f"Inside False")
# logging.info(msg=f"Inside False")
new_centroid = vector_post[0].astype(float)
logging.info(msg=f"Create New Cluster")
# logging.info(msg=f"Create New Cluster")
cluster_id = uuid.uuid1()
insert_new_info = Class_Table_Name(cluster_id=cluster_id, centroid=new_centroid,
......@@ -126,7 +127,7 @@ def clustering_post(text):
session.add(insert_new_info)
session.commit()
logging.info(msg=f"Insert New Cluster Into DB")
# logging.info(msg=f"Insert New Cluster Into DB")
return cluster_id
......@@ -144,7 +145,7 @@ def service(body, message):
logging.info(msg=f"relationships документа: {document_and_topics['relationships'][0]['reasons']}")
if text != None:
if language == 'ru' and len(text) > 150 and len(text) < 5000:
logging.info(msg=f"Документ Написан на Русском языке")
# logging.info(msg=f"Документ Написан на Русском языке")
cluster_data = clustering_post(text)
......@@ -154,7 +155,7 @@ def service(body, message):
output_document["subject_id"] = str(cluster_data)
output_document['url'] = url
logging.info(msg=f"Обработан документ c url: {url}")
# logging.info(msg=f"Обработан документ c url: {url}")
logging.info(msg=f"Документу c url: {url} присвоен кластер: {output_document.get('subject_id')}")
yield output_document
......@@ -170,7 +171,7 @@ def service(body, message):
output_document["subject_id"] = str(cluster_data)
output_document['id'] = output_id
logging.info(msg=f"Обработан документ c id: {i['topicId']}")
# logging.info(msg=f"Обработан документ c id: {i['topicId']}")
logging.info(msg=f"Документу c id: {i['topicId']} присвоен кластер: {output_document.get('subject_id')}")
yield output_document
......
# Standalone smoke-test script for the GPU build of faiss, adapted from the
# faiss "getting started" tutorial: build a flat L2 index on the GPU from
# random vectors and run a k-nearest-neighbour search.
# NOTE(review): `Arase` and `FAIL_FAST` are imported but never used in this
# script — candidates for removal.
from arase import Arase
from doctest import FAIL_FAST
import numpy as np
import faiss
import time
# Synthetic dataset: `nb` database vectors and `nq` query vectors of
# dimension `d`. A small increasing offset is added to the first component so
# that nearby ids end up being nearest neighbours (makes results easy to
# eyeball).
d = 64                           # dimension
nb = 100000                      # database size
nq = 10000                       # nb of queries
np.random.seed(1234)             # make reproducible
xb = np.random.random((nb, d)).astype('float32')
xb[:, 0] += np.arange(nb) / 1000.
xq = np.random.random((nq, d)).astype('float32')
xq[:, 0] += np.arange(nq) / 1000.
# NOTE(review): leftover debug print — consider removing before merge.
print("HUITAA")
# NOTE(review): this loop never exits, so every line below it is unreachable
# dead code. Presumably it was added to keep the container alive while
# debugging the GPU setup (e.g. to `docker exec` in) — confirm the intent;
# if the demo below is supposed to run, the loop must be removed or moved
# after the search.
while(True):
    time.sleep(20)
res = faiss.StandardGpuResources()  # use a single GPU
# build a flat (CPU) index
index_flat = faiss.IndexFlatL2(d)
# make it into a gpu index
gpu_index_flat = faiss.index_cpu_to_gpu(res, 0, index_flat)
gpu_index_flat.add(xb)         # add vectors to the index
print(gpu_index_flat.ntotal)
k = 4                          # we want to see 4 nearest neighbors
D, I = gpu_index_flat.search(xq, k)  # actual search
print(I[:5])                   # neighbors of the 5 first queries
print(I[-5:])                  # neighbors of the 5 last queries
......@@ -28,14 +28,21 @@ class Faiss_cluster:
"""
INDEX by faiss to quick search similar vectors
"""
vectors = vectors.astype('float32')
dimension = 300
quantiser = faiss.IndexFlatL2(dimension)
index = faiss.IndexIVFFlat(quantiser, dimension, faiss.METRIC_L2)
index.train(np.ascontiguousarray(vectors))
index.add(np.ascontiguousarray(vectors))
res = faiss.StandardGpuResources() # use a single GPU
# make it into a gpu index
gpu_index_flat = faiss.index_cpu_to_gpu(res, 0, index)
gpu_index_flat.train(np.ascontiguousarray(vectors))
gpu_index_flat.add(np.ascontiguousarray(vectors))
k = 1
D, I = index.search(np.ascontiguousarray(vector_post), k)
D, I = gpu_index_flat.search(np.ascontiguousarray(vector_post), k)
proba_sqrt = float(D[0][0] * 10)
if proba_sqrt < 2.5:
return I[0][0]
......
This source diff could not be displayed because it is too large. You can view the blob instead.
--extra-index-url http://10.20.4.25:8080/ --trusted-host 10.20.4.25
arase==0.1.7
alem_lemmatizer==0.0.1
faiss-cpu==1.6.3
gensim==3.8.3
sqlalchemy==1.3.20
psycopg2==2.8.6
pandas==1.0.5
\ No newline at end of file
pandas==1.0.5
faiss_gpu==1.7.2
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment