Commit a37436a0 authored by Yeldar Toktasynov

Initial commit

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
.python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
image: "10.20.4.30/devops/docker-builder:19.03.5"
stages:
- test
- build
- deploy
services:
- name: docker:19.03.5-dind
command: ["--insecure-registry=10.20.4.30"]
docker build image:
stage: build
script:
- docker build .
only:
refs:
- master
changes:
- VERSION
FROM python:3.6
WORKDIR /app
COPY requirements.txt /app
RUN pip install -r requirements.txt
COPY . /app
RUN python nltk_downloader.py
CMD ["python", "app.py"]
# Alem Rus Posts Clustering
Clusters Russian-language posts only.

Supported Python version: 3.6.

Install the requirements:
```bash
pip install -r requirements.txt
```
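To run the service locally, the same steps as the Dockerfile can be followed. This is a minimal sketch; it assumes `application.yml` and the `embedding/rus_embedding.pkl` word2vec pickle loaded by `text_preparation.py` are already in place:
```bash
pip install -r requirements.txt
# download the NLTK corpora (stopwords, wordnet) fetched by nltk_downloader.py
python nltk_downloader.py
# start the clustering service
python app.py
```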
from arase import Arase
import numpy as np
import logging
from text_preparation import Preparation
from cluster_faiss import Faiss_cluster
from datetime import datetime
from db_cluster import table_class, create_session
import uuid
import pickle as pkl
import pandas as pd
from sys import getsizeof

# INFO level so the logging.info calls below are actually emitted
logging.basicConfig(format="%(asctime)s %(message)s", level=logging.INFO)

app = Arase(__name__)
app.configure_from_yaml('application.yml')
text_preparation = Preparation('')

logging.info(msg="Start of the initial database query")
session = create_session(**app.config['postgres'])
Class_Table_Name = table_class(app.config['postgres'].get('table_name'))
df = pd.read_sql(
    session.query(Class_Table_Name.cluster_id, Class_Table_Name.count, Class_Table_Name.centroid)
    .filter(Class_Table_Name.is_actual == True, Class_Table_Name.language == 'ru')
    .statement,
    session.bind)
logging.info(msg=f"Number of Russian clusters: {len(df)} Size: {getsizeof(df)}")
df['centroid_new'] = df['centroid'].apply(lambda x: np.array(x, dtype='float32'))
DB_CENTROID_VECTORS = pd.DataFrame(df['centroid_new'].tolist()).to_numpy()
DB_CENTROID_ID = list(df['cluster_id'])
DB_CLUSTER_COUNT = list(df['count'])
logging.info(msg=f"End of the initial database query. CENTROID SIZE: {getsizeof(DB_CENTROID_VECTORS)}")
del df
def clustering_post(text):
    global DB_CENTROID_ID
    global DB_CENTROID_VECTORS
    global DB_CLUSTER_COUNT
    logging.info(msg="Select all modified TRUE data")
    clusters_query = session.query(Class_Table_Name.centroid, Class_Table_Name.cluster_id, Class_Table_Name.count) \
        .filter(Class_Table_Name.modified == True, Class_Table_Name.language == 'ru')
    for row in clusters_query:
        DB_CENTROID_ID.append(row.cluster_id)
        DB_CLUSTER_COUNT.append(row.count)
        single_np_array = np.array(row.centroid)
        single_np_array = single_np_array.astype('float32')
        DB_CENTROID_VECTORS = np.append(DB_CENTROID_VECTORS, single_np_array.reshape(1, -1), axis=0)
        logging.info(msg=f"BEFORE centroid size: {getsizeof(DB_CENTROID_VECTORS)} Length: {len(DB_CENTROID_VECTORS)}")
        clusters_length = len(DB_CENTROID_VECTORS)
        DB_CENTROID_VECTORS = DB_CENTROID_VECTORS[0:clusters_length]
        session.query(Class_Table_Name).filter(Class_Table_Name.cluster_id == row.cluster_id).update({'modified': False})
        session.commit()
        logging.info(msg=f"Modified Cluster ID: {row.cluster_id}")
        logging.info(msg=f"AFTER centroid size: {getsizeof(DB_CENTROID_VECTORS)} Length: {len(DB_CENTROID_VECTORS)}")
    logging.info(msg="Convert text into vector")
    vector_post = text_preparation.prepare_data(text)
    vector_post = vector_post.astype('float32')
    logging.info(msg="Start find similarity")
    ids = Faiss_cluster.faiss_search_similarity(DB_CENTROID_VECTORS, vector_post)
    logging.info(msg="End find similarity")
    cluster_id = None
    # faiss_search_similarity returns False when no centroid is close enough;
    # a match at index 0 would also compare equal to False, so test identity instead
    if ids is not False:
        cluster_id = DB_CENTROID_ID[ids]
        logging.info(msg="Found Similar Cluster")
        logging.info(msg="Start Compute Centroid")
        new_similar_centroid = Faiss_cluster.compute_centroid(DB_CENTROID_VECTORS[ids], vector_post[0])
        logging.info(msg="End Compute Centroid")
        new_similar_centroid = new_similar_centroid.astype(float)
        DB_CLUSTER_COUNT[ids] = DB_CLUSTER_COUNT[ids] + 1
        logging.info(msg="Start Update Similar Cluster DB")
        session.query(Class_Table_Name).filter(Class_Table_Name.cluster_id == cluster_id).update(
            {'centroid': new_similar_centroid, 'count': DB_CLUSTER_COUNT[ids]})
        session.commit()
        DB_CENTROID_VECTORS[ids] = new_similar_centroid
        logging.info(msg="End Update Similar Cluster DB")
    else:
        logging.info(msg="No similar cluster found")
        new_centroid = vector_post[0].astype(float)
        logging.info(msg="Create New Cluster")
        cluster_id = uuid.uuid1()
        insert_new_info = Class_Table_Name(cluster_id=cluster_id, centroid=new_centroid,
                                           created_date=datetime.now(), last_update_date=datetime.now(),
                                           modified=True, is_actual=True, language='ru', count=0)
        session.add(insert_new_info)
        session.commit()
        logging.info(msg="Insert New Cluster Into DB")
    return cluster_id
@app.service("input", "output")
def service(body, message):
    document_and_topics = body
    document = document_and_topics.get("document")
    text = document.get("text")
    language = document.get("language")
    logging.info(msg=f"Received document with id: {document.get('id')}")
    if language == 'ru' and len(text) > 150:
        logging.info(msg="The document is written in Russian")
        cluster_data = clustering_post(text)
        document["subjectId"] = str(cluster_data)
        logging.info(msg=f"Processed document with id: {document.get('id')}")
        logging.info(msg=f"Document with id: {document.get('id')} was assigned cluster: {document.get('subjectId')}")
    yield document_and_topics


if __name__ == '__main__':
    app.run()
connections:
  default:
    host: 10.30.10.115
    port: 5672
    username: monorepo-migration
    password: monorepo-migration
    connTimeout: 1000
    heartbeat: 360
consumers:
  input:
    connection: default
    prefetch_count: 10
    queue: ru-cluster
    routing_key: cluster
    exchange: enrichment-x-monorepo
producers:
  output:
    connection: default
    queue: geo-classification-q-monorepo
    routing_key: manual-markup-verifier-k
    exchange: manual-markup-verifier-x
postgres:
  username: alem
  password: 71e28258b0bed7533fb1
  host: 10.30.10.151
  port: 5432
  db_name: clusters
  table_name: ru_and_kz_cluster
import pickle as pkl
import numpy as np
import faiss
import math


class Faiss_cluster:

    @classmethod
    def compute_centroid(cls, cluster_vector, vector_post):
        """
        Compute a new centroid vector that includes the new post.
        """
        vectors = []
        vectors.append(cluster_vector)
        vectors.append(vector_post)
        vectors = np.asarray(vectors, dtype=np.float32)
        dimension = 300
        ncentroids = 1
        verbose = False
        kmeans = faiss.Kmeans(dimension, ncentroids, verbose=verbose)
        kmeans.train(np.ascontiguousarray(vectors))
        return kmeans.centroids[0]

    @classmethod
    def faiss_search_similarity(cls, vectors, vector_post):
        """
        Index the centroids with faiss for a quick similar-vector search.
        """
        vectors = vectors.astype('float32')
        dimension = 300
        quantiser = faiss.IndexFlatL2(dimension)
        # NOTE: the third positional argument of IndexIVFFlat is nlist,
        # so faiss.METRIC_L2 here is interpreted as the number of inverted lists
        index = faiss.IndexIVFFlat(quantiser, dimension, faiss.METRIC_L2)
        index.train(np.ascontiguousarray(vectors))
        index.add(np.ascontiguousarray(vectors))
        k = 1
        D, I = index.search(np.ascontiguousarray(vector_post), k)
        proba_sqrt = float(D[0][0] * 10)
        if proba_sqrt < 2.5:
            return I[0][0]
        else:
            return False
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy import Column, DateTime, String, ARRAY, Float, Boolean, Integer
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
from sqlalchemy.dialects.postgresql import UUID

base = declarative_base()


def create_session(username, password, host, db_name, port=None, table_name=None):
    port = port or 5432
    db = create_engine(f"postgresql://{username}:{password}@{host}:{port}/{db_name}")
    Session = sessionmaker(db)
    return Session()


def table_class(table_name):
    class Cluster_db(base):
        __tablename__ = table_name
        cluster_id = Column(UUID(as_uuid=True), primary_key=True, index=True)
        centroid = Column(ARRAY(Float))
        created_date = Column(DateTime)
        last_update_date = Column(DateTime)
        modified = Column(Boolean, index=True)
        is_actual = Column(Boolean, index=True)
        language = Column(String(16000))
        count = Column(Integer)
    return Cluster_db
# base.metadata.create_all(db)
version: '3.6'
services:
  clustering:
    build: .
    restart: unless-stopped
    deploy:
      resources:
        limits:
          cpus: '3'
          memory: 15500M
        # reservations:
        #   cpus: '0.5'
        #   memory: 2500M
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
--extra-index-url http://10.20.4.25:8080/ --trusted-host 10.20.4.25
arase==1.0.3
alem_lemmatizer==0.0.1
faiss-cpu==1.6.3
gensim==3.8.3
sqlalchemy==1.3.20
psycopg2==2.8.6
pandas==1.0.5
import re
import pickle as pkl
import numpy as np
from alem_lemmatizer import AlemLemmatizer

"""
word2vec vectorisation of the cleaned text
"""
w2v = pkl.load(open('embedding/rus_embedding.pkl', 'rb'))


class mean_vectorizer(object):
    def __init__(self, word2vec):
        self.word2vec = word2vec
        self.dim = len(next(iter(w2v.values())))

    def fit(self, X):
        return self

    def transform(self, X):
        return np.array([
            np.mean([self.word2vec[w] for w in words if w in self.word2vec]
                    or [np.zeros(self.dim)], axis=0)
            for words in X
        ])


"""
text preparation: strip unwanted characters and stop words, lemmatize
"""
class Preparation:
    def __init__(self, logger):
        self.logger = logger

    def cleaner(self, text):
        link = r"http\S+"
        bottom_space = r'_+'
        without_link = re.sub(link, " ", text)
        without_symbols = re.sub(r"[^\w/.-]", " ", without_link)
        without_bottom_space = re.sub(bottom_space, " ", without_symbols)
        without_empty_space = re.sub(r' +', ' ', without_bottom_space)
        without_dash = re.sub(r'\.+', '.', without_empty_space)
        clean_text = re.sub(r'\-+', '-', without_dash)
        return clean_text

    def prepare_data(self, data):
        cleaned_text = self.cleaner(str(data))
        lemmatizer = AlemLemmatizer()
        lemma_text = lemmatizer.get_lemmatized_text(cleaned_text, lang='rus')
        prepared_test_data = [lemma_text]
        prepared_test_data_vect = mean_vectorizer(w2v).fit(prepared_test_data).transform(prepared_test_data)
        return prepared_test_data_vect