Commit e89470d3 authored by Yeldar Toktasynov's avatar Yeldar Toktasynov

Rewrote faiss with GPU version

parent 2ccb1f1a
......@@ -2,12 +2,12 @@ FROM python:3.6
WORKDIR /app
COPY requirements.txt /app
COPY pip_requirements.txt /app
RUN pip install -r requirements.txt
RUN pip install -r pip_requirements.txt
COPY . /app
RUN python nltk_downloader.py
CMD ["python", "app.py"]
\ No newline at end of file
CMD ["python", "app.py"]
......@@ -36,6 +36,7 @@ df = pd.read_sql(session.query(Class_Table_Name.cluster_id, Class_Table_Name.cou
.filter(Class_Table_Name.is_actual == True, Class_Table_Name.language == 'ru')
.filter(or_(Class_Table_Name.count != 0, and_(Class_Table_Name.count==0,
Class_Table_Name.created_date > datetime.today() - timedelta(days = 30))))
.limit(1000)
.statement,session.bind)
......@@ -56,7 +57,7 @@ def clustering_post(text):
global DB_CENTROID_VECTORS
global DB_CLUSTER_COUNT
logging.info(msg=f"Select all modified TRUE data")
# logging.info(msg=f"Select all modified TRUE data")
clusters_query = session.query(Class_Table_Name.centroid, Class_Table_Name.cluster_id, Class_Table_Name.count).filter(Class_Table_Name.modified == True, Class_Table_Name.language == 'ru')
for row in clusters_query:
......@@ -66,7 +67,7 @@ def clustering_post(text):
single_np_array = single_np_array.astype('float32')
DB_CENTROID_VECTORS = np.append(DB_CENTROID_VECTORS, single_np_array.reshape(1, -1), axis = 0)
logging.info(msg=f"ДО РАЗМЕР центройда: {getsizeof(DB_CENTROID_VECTORS)} Длина: {len(DB_CENTROID_VECTORS)}")
# logging.info(msg=f"ДО РАЗМЕР центройда: {getsizeof(DB_CENTROID_VECTORS)} Длина: {len(DB_CENTROID_VECTORS)}")
clusters_length = len(DB_CENTROID_VECTORS)
DB_CENTROID_VECTORS = DB_CENTROID_VECTORS[0:clusters_length]
......@@ -76,48 +77,48 @@ def clustering_post(text):
logging.info(msg=f"Modified Cluster ID: {row.cluster_id}")
logging.info(msg=f"ПОСЛЕ РАЗМЕР центройда: {getsizeof(DB_CENTROID_VECTORS)} Длина: {len(DB_CENTROID_VECTORS)}")
# logging.info(msg=f"ПОСЛЕ РАЗМЕР центройда: {getsizeof(DB_CENTROID_VECTORS)} Длина: {len(DB_CENTROID_VECTORS)}")
logging.info(msg=f"Convert text into vector")
# logging.info(msg=f"Convert text into vector")
vector_post = text_preparation.prepare_data(text)
vector_post = vector_post.astype('float32')
logging.info(msg=f"Start find similarity")
# logging.info(msg=f"Start find similarity")
ids = Faiss_cluster.faiss_search_similarity(DB_CENTROID_VECTORS, vector_post)
logging.info(msg=f"End find similarity")
# logging.info(msg=f"End find similarity")
cluster_id = None
if ids != False:
cluster_id = DB_CENTROID_ID[ids]
logging.info(msg=f"Found Similar Cluster")
# logging.info(msg=f"Found Similar Cluster")
logging.info(msg=f"Start Compute Centroid")
# logging.info(msg=f"Start Compute Centroid")
new_similar_centroid = Faiss_cluster.compute_centroid(DB_CENTROID_VECTORS[ids], vector_post[0])
logging.info(msg=f"End Compute Centroid")
# logging.info(msg=f"End Compute Centroid")
new_similar_centroid = new_similar_centroid.astype(float)
DB_CLUSTER_COUNT[ids] = DB_CLUSTER_COUNT[ids] + 1
logging.info(msg=f"Start Update Similar Cluster DB")
# logging.info(msg=f"Start Update Similar Cluster DB")
session.query(Class_Table_Name).filter(Class_Table_Name.cluster_id == cluster_id).update({'centroid': new_similar_centroid, 'count': DB_CLUSTER_COUNT[ids]})
session.commit()
DB_CENTROID_VECTORS[ids] = new_similar_centroid
logging.info(msg=f"End Update Similar Cluster DB")
# logging.info(msg=f"End Update Similar Cluster DB")
elif ids == False:
logging.info(msg=f"Inside False")
# logging.info(msg=f"Inside False")
new_centroid = vector_post[0].astype(float)
logging.info(msg=f"Create New Cluster")
# logging.info(msg=f"Create New Cluster")
cluster_id = uuid.uuid1()
insert_new_info = Class_Table_Name(cluster_id=cluster_id, centroid=new_centroid,
......@@ -126,7 +127,7 @@ def clustering_post(text):
session.add(insert_new_info)
session.commit()
logging.info(msg=f"Insert New Cluster Into DB")
# logging.info(msg=f"Insert New Cluster Into DB")
return cluster_id
......@@ -144,7 +145,7 @@ def service(body, message):
logging.info(msg=f"relationships документа: {document_and_topics['relationships'][0]['reasons']}")
if text != None:
if language == 'ru' and len(text) > 150 and len(text) < 5000:
logging.info(msg=f"Документ Написан на Русском языке")
# logging.info(msg=f"Документ Написан на Русском языке")
cluster_data = clustering_post(text)
......@@ -154,7 +155,7 @@ def service(body, message):
output_document["subject_id"] = str(cluster_data)
output_document['url'] = url
logging.info(msg=f"Обработан документ c url: {url}")
# logging.info(msg=f"Обработан документ c url: {url}")
logging.info(msg=f"Документу c url: {url} присвоен кластер: {output_document.get('subject_id')}")
yield output_document
......@@ -170,7 +171,7 @@ def service(body, message):
output_document["subject_id"] = str(cluster_data)
output_document['id'] = output_id
logging.info(msg=f"Обработан документ c id: {i['topicId']}")
# logging.info(msg=f"Обработан документ c id: {i['topicId']}")
logging.info(msg=f"Документу c id: {i['topicId']} присвоен кластер: {output_document.get('subject_id')}")
yield output_document
......
# Standalone smoke-test script for the GPU build of faiss, adapted from the
# faiss "getting started" tutorial: build a flat L2 index on the GPU from
# random vectors and run a k-nearest-neighbour search.
# NOTE(review): `Arase` and `FAIL_FAST` are imported but never used in this
# script — candidates for removal.
from arase import Arase
from doctest import FAIL_FAST
import numpy as np
import faiss
import time
# Synthetic dataset: `nb` database vectors and `nq` query vectors of
# dimension `d`. A small increasing offset is added to the first component so
# that nearby ids end up being nearest neighbours (makes results easy to
# eyeball).
d = 64                           # dimension
nb = 100000                      # database size
nq = 10000                       # nb of queries
np.random.seed(1234)             # make reproducible
xb = np.random.random((nb, d)).astype('float32')
xb[:, 0] += np.arange(nb) / 1000.
xq = np.random.random((nq, d)).astype('float32')
xq[:, 0] += np.arange(nq) / 1000.
# NOTE(review): leftover debug print — consider removing before merge.
print("HUITAA")
# NOTE(review): this loop never exits, so every line below it is unreachable
# dead code. Presumably it was added to keep the container alive while
# debugging the GPU setup (e.g. to `docker exec` in) — confirm the intent;
# if the demo below is supposed to run, the loop must be removed or moved
# after the search.
while(True):
    time.sleep(20)
res = faiss.StandardGpuResources()  # use a single GPU
# build a flat (CPU) index
index_flat = faiss.IndexFlatL2(d)
# make it into a gpu index
gpu_index_flat = faiss.index_cpu_to_gpu(res, 0, index_flat)
gpu_index_flat.add(xb)         # add vectors to the index
print(gpu_index_flat.ntotal)
k = 4                          # we want to see 4 nearest neighbors
D, I = gpu_index_flat.search(xq, k)  # actual search
print(I[:5])                   # neighbors of the 5 first queries
print(I[-5:])                  # neighbors of the 5 last queries
......@@ -28,14 +28,21 @@ class Faiss_cluster:
"""
INDEX by faiss to quick search similar vectors
"""
vectors = vectors.astype('float32')
dimension = 300
quantiser = faiss.IndexFlatL2(dimension)
index = faiss.IndexIVFFlat(quantiser, dimension, faiss.METRIC_L2)
index.train(np.ascontiguousarray(vectors))
index.add(np.ascontiguousarray(vectors))
res = faiss.StandardGpuResources() # use a single GPU
# make it into a gpu index
gpu_index_flat = faiss.index_cpu_to_gpu(res, 0, index)
gpu_index_flat.train(np.ascontiguousarray(vectors))
gpu_index_flat.add(np.ascontiguousarray(vectors))
k = 1
D, I = index.search(np.ascontiguousarray(vector_post), k)
D, I = gpu_index_flat.search(np.ascontiguousarray(vector_post), k)
proba_sqrt = float(D[0][0] * 10)
if proba_sqrt < 2.5:
return I[0][0]
......
This source diff could not be displayed because it is too large. You can view the blob instead.
--extra-index-url http://10.20.4.25:8080/ --trusted-host 10.20.4.25
arase==0.1.7
alem_lemmatizer==0.0.1
faiss-cpu==1.6.3
gensim==3.8.3
sqlalchemy==1.3.20
psycopg2==2.8.6
pandas==1.0.5
\ No newline at end of file
pandas==1.0.5
faiss_gpu==1.7.2
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment