from arase import Arase
import numpy as np
import logging
from text_preparation import Preparation
from cluster_faiss import Faiss_cluster
from datetime import datetime, timedelta
from db_cluster import table_class, create_session
import uuid
import pandas as pd
from sys import getsizeof
from sqlalchemy import or_, and_


# Root logger format for every message emitted by this module.
# NOTE(review): no ``level`` is passed, so unless logging is configured
# elsewhere the root logger keeps its default WARNING level and the
# logging.info() calls in this file produce no output - confirm this is intended.
logging.basicConfig(format="%(asctime)s %(message)s")

# Arase application object, configured from application.yml.
app = Arase(__name__)
app.read_config_yaml("application.yml")

# Preparation: language identification, cleaning and lemmatization of text.
text_preparation = Preparation("")

# Initial warm-up query: fetch the currently relevant clusters from the
# database and keep their ids, counts and centroids in module-level state.

logging.info(msg="Начало Первого запроса в базу")

session = create_session(**app.config['postgres'])
Class_Table_Name = table_class(app.config['postgres'].get('table_name'))

# A cluster is loaded when it is actual, has language 'ru', and is either
# non-empty or was created within the last 30 days. At most 1000 rows are read.
# NOTE(review): the LIMIT has no ORDER BY, so which rows survive the cap when
# more than 1000 qualify is left to the database - confirm this is acceptable.
initial_clusters_query = (
    session.query(
        Class_Table_Name.cluster_id,
        Class_Table_Name.count,
        Class_Table_Name.centroid,
    )
    # "== True" is intentional: SQLAlchemy turns it into a SQL comparison.
    .filter(Class_Table_Name.is_actual == True, Class_Table_Name.language == 'ru')
    .filter(
        or_(
            Class_Table_Name.count != 0,
            and_(
                Class_Table_Name.count == 0,
                Class_Table_Name.created_date > datetime.today() - timedelta(days=30),
            ),
        )
    )
    .limit(1000)
)
clusters_df = pd.read_sql(initial_clusters_query.statement, session.bind)

logging.info(msg=f"Количество кластеров в рус: {len(clusters_df)} Size: {getsizeof(clusters_df)}")

# One float32 vector per cluster, stacked row-wise into a single matrix.
centroid_rows = [np.array(centroid, dtype='float32') for centroid in clusters_df['centroid']]
DB_CENTROID_VECTORS = pd.DataFrame(centroid_rows).to_numpy()
DB_CENTROID_ID = list(clusters_df['cluster_id'])
DB_CLUSTER_COUNT = list(clusters_df['count'])

logging.info(msg=f"Конец Первого запроса в базу. РАЗМЕР ЦЕНТРОЙДОВ: {getsizeof(DB_CENTROID_VECTORS)}")

# Drop the temporaries so only the cache and the DB handles stay at module level.
del clusters_df, centroid_rows, initial_clusters_query
 
def _apply_and_commit(change):
    """Run *change* against the shared session and commit it.

    If the change or the commit fails, the session is rolled back before the
    error is re-raised, so the module-level session stays usable for the next
    message instead of being stuck in a failed transaction.
    """
    try:
        change()
        session.commit()
    except Exception:
        session.rollback()
        raise


def _no_similar_cluster(ids):
    """Return True only when the similarity search answered with a boolean False.

    A plain ``ids != False`` test is wrong here because ``0 == False`` in
    Python: a match at position 0 would be mistaken for "nothing found".
    """
    return isinstance(ids, (bool, np.bool_)) and not ids


def _load_modified_clusters():
    """Merge the clusters flagged ``modified`` in the database into the cache.

    The flags are cleared in a single transaction first; the in-memory lists
    and the centroid matrix are only touched once that commit succeeded, so a
    database failure leaves the rows flagged for the next call.
    """
    global DB_CENTROID_VECTORS

    modified_rows = (
        session.query(
            Class_Table_Name.centroid,
            Class_Table_Name.cluster_id,
            Class_Table_Name.count,
        )
        .filter(Class_Table_Name.modified == True, Class_Table_Name.language == 'ru')
        .all()
    )
    if not modified_rows:
        return

    def clear_flags():
        for row in modified_rows:
            session.query(Class_Table_Name).filter(
                Class_Table_Name.cluster_id == row.cluster_id
            ).update({'modified': False})

    _apply_and_commit(clear_flags)

    # Clusters that are already cached are refreshed in place rather than
    # appended again, which would leave two rows for the same cluster id.
    position_by_id = {known_id: pos for pos, known_id in enumerate(DB_CENTROID_ID)}
    fresh_vectors = []
    for row in modified_rows:
        vector = np.asarray(row.centroid, dtype='float32').ravel()
        cached_at = position_by_id.get(row.cluster_id)
        if cached_at is None:
            DB_CENTROID_ID.append(row.cluster_id)
            DB_CLUSTER_COUNT.append(row.count)
            fresh_vectors.append(vector)
        else:
            DB_CENTROID_VECTORS[cached_at] = vector
            DB_CLUSTER_COUNT[cached_at] = row.count

        logging.info(msg=f"Modified Cluster ID: {row.cluster_id}")

    if fresh_vectors:
        # One stack for the whole batch instead of re-copying the matrix per row.
        stacked = np.vstack(fresh_vectors)
        if DB_CENTROID_VECTORS.size == 0:
            # An empty initial load yields a (0, 0) matrix that cannot be
            # concatenated with real vectors; start from the new rows instead.
            DB_CENTROID_VECTORS = stacked
        else:
            DB_CENTROID_VECTORS = np.vstack((DB_CENTROID_VECTORS, stacked))


def clustering_post(text):
    """Assign *text* to a cluster and return that cluster's id.

    Steps:
      1. Pull clusters flagged ``modified`` in the database into the cache.
      2. Vectorize the text with ``text_preparation.prepare_data``.
      3. Ask ``Faiss_cluster.faiss_search_similarity`` for a similar centroid.
      4. If one is found, recompute its centroid, increment its count and
         store both in the database and in the cache. Otherwise insert a new
         cluster row (``modified=True``, ``count=0``); it enters the cache on
         the next call through step 1.

    Args:
        text: Raw document text.

    Returns:
        The id of the matched cluster as stored in ``DB_CENTROID_ID``, or a
        newly generated ``uuid.UUID`` when a new cluster was created.

    Raises:
        Exception: Any database error is re-raised after the session has been
            rolled back.
    """
    _load_modified_clusters()

    # Element 0 of the prepared result is used below as the post's vector.
    vector_post = text_preparation.prepare_data(text).astype('float32')

    ids = Faiss_cluster.faiss_search_similarity(DB_CENTROID_VECTORS, vector_post)

    if _no_similar_cluster(ids):
        cluster_id = uuid.uuid1()
        created_at = datetime.now()
        new_cluster = Class_Table_Name(
            cluster_id=cluster_id,
            centroid=vector_post[0].astype(float),
            created_date=created_at,
            last_update_date=created_at,
            modified=True,
            is_actual=True,
            language='ru',
            count=0,
        )
        _apply_and_commit(lambda: session.add(new_cluster))
        return cluster_id

    cluster_id = DB_CENTROID_ID[ids]
    updated_centroid = Faiss_cluster.compute_centroid(
        DB_CENTROID_VECTORS[ids], vector_post[0]
    ).astype(float)
    updated_count = DB_CLUSTER_COUNT[ids] + 1

    _apply_and_commit(
        lambda: session.query(Class_Table_Name)
        .filter(Class_Table_Name.cluster_id == cluster_id)
        .update({'centroid': updated_centroid, 'count': updated_count})
    )

    # Mirror the change in memory only after the database accepted it.
    DB_CLUSTER_COUNT[ids] = updated_count
    DB_CENTROID_VECTORS[ids] = updated_centroid

    return cluster_id

@app.service("input", "output")
def service(body, message):
    """Cluster an incoming document and yield its cluster assignment(s).

    A document is processed only when all of the following hold:
      * ``body['relationships']`` is non-empty and the first relationship's
        ``reasons`` contain neither ``'MIGRATION'`` nor ``'UPDATE'``;
      * the document has text, its language is ``'ru'`` and the text length
        is strictly between 150 and 5000 characters.

    A body without ``document`` or without ``relationships`` is skipped
    instead of raising.

    Args:
        body: Mapping with a ``document`` mapping (``text``, ``language``,
            ``url``, ``id``) and a ``relationships`` list whose items carry
            ``reasons`` and ``topicId``.
        message: Not used by this handler.

    Yields:
        ``{"subject_id": <cluster id>, "url": <url>}`` when the document has
        a url; otherwise one ``{"subject_id": <cluster id>,
        "id": "<document id>#<topicId>"}`` per relationship.
    """
    document = body.get("document") or {}
    text = document.get("text")
    language = document.get("language")
    url = document.get("url")
    document_id = document.get("id")

    logging.info(msg=f"Получен документ c url: {url}")

    relationships = body.get('relationships') or []
    if not relationships:
        return

    # Only the first relationship decides whether the document is skipped.
    reasons = relationships[0]['reasons']
    if 'MIGRATION' in reasons or 'UPDATE' in reasons:
        return

    logging.info(msg=f"relationships документа: {reasons}")

    if text is None or language != 'ru' or not 150 < len(text) < 5000:
        return

    subject_id = str(clustering_post(text))

    if url is not None:
        logging.info(msg=f"Документу c url: {url} присвоен кластер: {subject_id}")
        yield {"subject_id": subject_id, 'url': url}
        return

    # No url: emit one record per relationship, keyed by "<id>#<topicId>".
    for relationship in relationships:
        topic_id = relationship['topicId']
        output_id = document_id + "#" + topic_id

        logging.info(msg=f"Документу c id: {topic_id} присвоен кластер: {subject_id}")

        yield {"subject_id": subject_id, 'id': output_id}

# Script entry point: call app.run() only when this file is executed directly,
# not when it is imported.
if __name__ == '__main__':
    app.run()
