Commit a37436a0 authored by Yeldar Toktasynov

Initial commit

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
.python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
image: "10.20.4.30/devops/docker-builder:19.03.5"
stages:
- test
- build
- deploy
services:
- name: docker:19.03.5-dind
command: ["--insecure-registry=10.20.4.30"]
docker build image:
stage: build
script:
- docker build .
only:
refs:
- master
changes:
- VERSION
FROM python:3.6
WORKDIR /app
COPY requirements.txt /app
RUN pip install -r requirements.txt
COPY . /app
RUN python nltk_downloader.py
CMD ["python", "app.py"]
# Alem Rus Posts Clustering
Clusters Russian-language posts only.

Supported Python version: 3.6.

Install the requirements:
```bash
pip install -r requirements.txt
```
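To run the service locally, the same steps as the Dockerfile can be followed. This is a minimal sketch; it assumes `application.yml` and the `embedding/rus_embedding.pkl` word2vec pickle loaded by `text_preparation.py` are already in place:
```bash
pip install -r requirements.txt
# download the NLTK corpora (stopwords, wordnet) fetched by nltk_downloader.py
python nltk_downloader.py
# start the clustering service
python app.py
```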
from arase import Arase
import numpy as np
import logging
from text_preparation import Preparation
from cluster_faiss import Faiss_cluster
from datetime import datetime
from db_cluster import table_class, create_session
import uuid
import pickle as pkl
import pandas as pd
from sys import getsizeof

# INFO level so the logging.info calls below are actually emitted
logging.basicConfig(format="%(asctime)s %(message)s", level=logging.INFO)

app = Arase(__name__)
app.configure_from_yaml('application.yml')
text_preparation = Preparation('')

logging.info(msg="Start of the initial database query")
session = create_session(**app.config['postgres'])
Class_Table_Name = table_class(app.config['postgres'].get('table_name'))
df = pd.read_sql(
    session.query(Class_Table_Name.cluster_id, Class_Table_Name.count, Class_Table_Name.centroid)
    .filter(Class_Table_Name.is_actual == True, Class_Table_Name.language == 'ru')
    .statement,
    session.bind)
logging.info(msg=f"Number of Russian clusters: {len(df)} Size: {getsizeof(df)}")
df['centroid_new'] = df['centroid'].apply(lambda x: np.array(x, dtype='float32'))
DB_CENTROID_VECTORS = pd.DataFrame(df['centroid_new'].tolist()).to_numpy()
DB_CENTROID_ID = list(df['cluster_id'])
DB_CLUSTER_COUNT = list(df['count'])
logging.info(msg=f"End of the initial database query. CENTROID SIZE: {getsizeof(DB_CENTROID_VECTORS)}")
del df
def clustering_post(text):
    global DB_CENTROID_ID
    global DB_CENTROID_VECTORS
    global DB_CLUSTER_COUNT
    logging.info(msg="Select all modified TRUE data")
    clusters_query = session.query(Class_Table_Name.centroid, Class_Table_Name.cluster_id, Class_Table_Name.count) \
        .filter(Class_Table_Name.modified == True, Class_Table_Name.language == 'ru')
    for row in clusters_query:
        DB_CENTROID_ID.append(row.cluster_id)
        DB_CLUSTER_COUNT.append(row.count)
        single_np_array = np.array(row.centroid)
        single_np_array = single_np_array.astype('float32')
        DB_CENTROID_VECTORS = np.append(DB_CENTROID_VECTORS, single_np_array.reshape(1, -1), axis=0)
        logging.info(msg=f"BEFORE centroid size: {getsizeof(DB_CENTROID_VECTORS)} Length: {len(DB_CENTROID_VECTORS)}")
        clusters_length = len(DB_CENTROID_VECTORS)
        DB_CENTROID_VECTORS = DB_CENTROID_VECTORS[0:clusters_length]
        session.query(Class_Table_Name).filter(Class_Table_Name.cluster_id == row.cluster_id).update({'modified': False})
        session.commit()
        logging.info(msg=f"Modified Cluster ID: {row.cluster_id}")
        logging.info(msg=f"AFTER centroid size: {getsizeof(DB_CENTROID_VECTORS)} Length: {len(DB_CENTROID_VECTORS)}")
    logging.info(msg="Convert text into vector")
    vector_post = text_preparation.prepare_data(text)
    vector_post = vector_post.astype('float32')
    logging.info(msg="Start find similarity")
    ids = Faiss_cluster.faiss_search_similarity(DB_CENTROID_VECTORS, vector_post)
    logging.info(msg="End find similarity")
    cluster_id = None
    # faiss_search_similarity returns False when no centroid is close enough;
    # a match at index 0 would also compare equal to False, so test identity instead
    if ids is not False:
        cluster_id = DB_CENTROID_ID[ids]
        logging.info(msg="Found Similar Cluster")
        logging.info(msg="Start Compute Centroid")
        new_similar_centroid = Faiss_cluster.compute_centroid(DB_CENTROID_VECTORS[ids], vector_post[0])
        logging.info(msg="End Compute Centroid")
        new_similar_centroid = new_similar_centroid.astype(float)
        DB_CLUSTER_COUNT[ids] = DB_CLUSTER_COUNT[ids] + 1
        logging.info(msg="Start Update Similar Cluster DB")
        session.query(Class_Table_Name).filter(Class_Table_Name.cluster_id == cluster_id).update(
            {'centroid': new_similar_centroid, 'count': DB_CLUSTER_COUNT[ids]})
        session.commit()
        DB_CENTROID_VECTORS[ids] = new_similar_centroid
        logging.info(msg="End Update Similar Cluster DB")
    else:
        logging.info(msg="No similar cluster found")
        new_centroid = vector_post[0].astype(float)
        logging.info(msg="Create New Cluster")
        cluster_id = uuid.uuid1()
        insert_new_info = Class_Table_Name(cluster_id=cluster_id, centroid=new_centroid,
                                           created_date=datetime.now(), last_update_date=datetime.now(),
                                           modified=True, is_actual=True, language='ru', count=0)
        session.add(insert_new_info)
        session.commit()
        logging.info(msg="Insert New Cluster Into DB")
    return cluster_id
@app.service("input", "output")
def service(body, message):
    document_and_topics = body
    document = document_and_topics.get("document")
    text = document.get("text")
    language = document.get("language")
    logging.info(msg=f"Received document with id: {document.get('id')}")
    if language == 'ru' and len(text) > 150:
        logging.info(msg="The document is written in Russian")
        cluster_data = clustering_post(text)
        document["subjectId"] = str(cluster_data)
        logging.info(msg=f"Processed document with id: {document.get('id')}")
        logging.info(msg=f"Document with id: {document.get('id')} was assigned cluster: {document.get('subjectId')}")
    yield document_and_topics


if __name__ == '__main__':
    app.run()
connections:
  default:
    host: 10.30.10.115
    port: 5672
    username: monorepo-migration
    password: monorepo-migration
    connTimeout: 1000
    heartbeat: 360
consumers:
  input:
    connection: default
    prefetch_count: 10
    queue: ru-cluster
    routing_key: cluster
    exchange: enrichment-x-monorepo
producers:
  output:
    connection: default
    queue: geo-classification-q-monorepo
    routing_key: manual-markup-verifier-k
    exchange: manual-markup-verifier-x
postgres:
  username: alem
  password: 71e28258b0bed7533fb1
  host: 10.30.10.151
  port: 5432
  db_name: clusters
  table_name: ru_and_kz_cluster
import pickle as pkl
import numpy as np
import faiss
import math


class Faiss_cluster:

    @classmethod
    def compute_centroid(cls, cluster_vector, vector_post):
        """
        Compute a new centroid vector that includes the new post.
        """
        vectors = []
        vectors.append(cluster_vector)
        vectors.append(vector_post)
        vectors = np.asarray(vectors, dtype=np.float32)
        dimension = 300
        ncentroids = 1
        verbose = False
        kmeans = faiss.Kmeans(dimension, ncentroids, verbose=verbose)
        kmeans.train(np.ascontiguousarray(vectors))
        return kmeans.centroids[0]

    @classmethod
    def faiss_search_similarity(cls, vectors, vector_post):
        """
        Index the centroids with faiss for a quick similar-vector search.
        """
        vectors = vectors.astype('float32')
        dimension = 300
        quantiser = faiss.IndexFlatL2(dimension)
        # NOTE: the third positional argument of IndexIVFFlat is nlist,
        # so faiss.METRIC_L2 here is interpreted as the number of inverted lists
        index = faiss.IndexIVFFlat(quantiser, dimension, faiss.METRIC_L2)
        index.train(np.ascontiguousarray(vectors))
        index.add(np.ascontiguousarray(vectors))
        k = 1
        D, I = index.search(np.ascontiguousarray(vector_post), k)
        proba_sqrt = float(D[0][0] * 10)
        if proba_sqrt < 2.5:
            return I[0][0]
        else:
            return False
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy import Column, DateTime, String, ARRAY, Float, Boolean, Integer
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
from sqlalchemy.dialects.postgresql import UUID

base = declarative_base()


def create_session(username, password, host, db_name, port=None, table_name=None):
    port = port or 5432
    db = create_engine(f"postgresql://{username}:{password}@{host}:{port}/{db_name}")
    Session = sessionmaker(db)
    return Session()


def table_class(table_name):
    class Cluster_db(base):
        __tablename__ = table_name
        cluster_id = Column(UUID(as_uuid=True), primary_key=True, index=True)
        centroid = Column(ARRAY(Float))
        created_date = Column(DateTime)
        last_update_date = Column(DateTime)
        modified = Column(Boolean, index=True)
        is_actual = Column(Boolean, index=True)
        language = Column(String(16000))
        count = Column(Integer)
    return Cluster_db
# base.metadata.create_all(db)
version: '3.6'
services:
  clustering:
    build: .
    restart: unless-stopped
    deploy:
      resources:
        limits:
          cpus: '3'
          memory: 15500M
        # reservations:
        #   cpus: '0.5'
        #   memory: 2500M
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
--extra-index-url http://10.20.4.25:8080/ --trusted-host 10.20.4.25
arase==1.0.3
alem_lemmatizer==0.0.1
faiss-cpu==1.6.3
gensim==3.8.3
sqlalchemy==1.3.20
psycopg2==2.8.6
pandas==1.0.5
import re
import pickle as pkl
import numpy as np
from alem_lemmatizer import AlemLemmatizer

"""
word2vec vectorisation of the cleaned text
"""
w2v = pkl.load(open('embedding/rus_embedding.pkl', 'rb'))


class mean_vectorizer(object):
    def __init__(self, word2vec):
        self.word2vec = word2vec
        self.dim = len(next(iter(w2v.values())))

    def fit(self, X):
        return self

    def transform(self, X):
        return np.array([
            np.mean([self.word2vec[w] for w in words if w in self.word2vec]
                    or [np.zeros(self.dim)], axis=0)
            for words in X
        ])


"""
text preparation: strip unwanted characters and stop words, lemmatize
"""
class Preparation:
    def __init__(self, logger):
        self.logger = logger

    def cleaner(self, text):
        link = r"http\S+"
        bottom_space = r'_+'
        without_link = re.sub(link, " ", text)
        without_symbols = re.sub(r"[^\w/.-]", " ", without_link)
        without_bottom_space = re.sub(bottom_space, " ", without_symbols)
        without_empty_space = re.sub(r' +', ' ', without_bottom_space)
        without_dash = re.sub(r'\.+', '.', without_empty_space)
        clean_text = re.sub(r'\-+', '-', without_dash)
        return clean_text

    def prepare_data(self, data):
        cleaned_text = self.cleaner(str(data))
        lemmatizer = AlemLemmatizer()
        lemma_text = lemmatizer.get_lemmatized_text(cleaned_text, lang='rus')
        prepared_test_data = [lemma_text]
        prepared_test_data_vect = mean_vectorizer(w2v).fit(prepared_test_data).transform(prepared_test_data)
        return prepared_test_data_vect