import gradio as gr
import torch
import json
import pandas as pd
import numpy as np
import calendar
from transformers import (
    AutoTokenizer,
    AutoModelForTokenClassification,
    AutoModelForSequenceClassification,
    RobertaForTokenClassification,
)
from faker import Faker
from keras.utils import pad_sequences
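# Note: the imports assume that gradio, torch, transformers, pandas, numpy, faker and
# keras (2.9+ for pad_sequences) are available in the environment, e.g. via the Space's
# requirements.txt.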
class out_json():
    def __init__(self, w, l):
        self.word = w
        self.label = l


class MyEncoder(json.JSONEncoder):
    def default(self, obj):
        return {
            'word': obj.word,
            'label': obj.label
        }
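###
### Model: handles a single free-text input. It detects the language, loads the matching
### NER model, and either labels the recognized entities or replaces them with fake data.
###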
class Model:
    def __init__(self):
        self.texto = ""
        self.idioma = ""
        self.modelo_ner = ""
        self.categoria_texto = ""

    ###
    ### Function that applies the language-detection model and identifies the text's language
    ###
    def identificacion_idioma(self, text):
        self.texto = text
        tokenizer = AutoTokenizer.from_pretrained("papluca/xlm-roberta-base-language-detection")
        model = AutoModelForSequenceClassification.from_pretrained("papluca/xlm-roberta-base-language-detection")
        inputs = tokenizer(text, padding=True, truncation=True, return_tensors="pt")
        with torch.no_grad():
            logits = model(**inputs).logits
        preds = torch.softmax(logits, dim=-1)
        id2lang = model.config.id2label
        vals, idxs = torch.max(preds, dim=1)
        # keep the language with the highest score
        maximo = vals.max()
        idioma = ''
        porcentaje = 0
        for k, v in zip(idxs, vals):
            if v.item() == maximo:
                idioma, porcentaje = id2lang[k.item()], v.item()
        if idioma == 'es':
            self.idioma = "es"
            self.modelo_ner = 'BSC-LT/roberta_model_for_anonimization'
            self.faker_ = Faker('es_MX')
            self.model = RobertaForTokenClassification.from_pretrained(self.modelo_ner)
        else:
            self.idioma = "en"
            self.faker_ = Faker('en_US')
            self.modelo_ner = "dayannex/distilbert-tuned-4labels"
            self.model = AutoModelForTokenClassification.from_pretrained(self.modelo_ner)
        self.categorizar_texto(self.texto)
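    ###
    ### reordenacion_tokens / reordenacion_tokens_es merge subword pieces back into whole
    ### words ('#'-prefixed continuations for the English WordPiece tokenizer, 'Ġ'-prefixed
    ### word starts for the Spanish byte-level BPE tokenizer) and record the indices of the
    ### merged pieces so that their predicted labels can be skipped afterwards.
    ###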
    def reordenacion_tokens(self, tokens, caracter):
        i = 0
        new_tokens = []
        ig_tokens = []
        for token in tokens:
            ind = len(new_tokens)
            if i < len(tokens):
                if not token.startswith(caracter):
                    new_tokens.append(token)
                    i = i + 1
                else:
                    new_tokens[ind - 1] = new_tokens[ind - 1] + token.replace(caracter, '')
                    ig_tokens.append(i)
                    i = i + 1
        return (
            new_tokens,
            ig_tokens
        )

    def reordenacion_tokens_es(self, tokens, caracter):
        i = 0
        new_tokens = []
        ig_tokens = []  # indices to ignore in the identifier array
        for token in tokens:
            ind = len(new_tokens)
            if i < len(tokens):
                if token.startswith(caracter):
                    new_tokens.append(token)
                    i = i + 1
                else:
                    new_tokens[ind - 1] = new_tokens[ind - 1] + token.replace(caracter, '')
                    ig_tokens.append(i)
                    i = i + 1
        return (
            new_tokens,
            ig_tokens
        )

    def reordenacion_identificadores(self, ig_tokens, predicted_tokens_classes):
        x = 0
        new_identificadores = []
        for token in predicted_tokens_classes:
            if x not in ig_tokens:
                new_identificadores.append(token)
            x = x + 1
        return new_identificadores
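    ###
    ### salida_json builds the JSON output: one {word, label} entry per token whose
    ### predicted label is not 'O'.
    ###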
    def salida_json(self, tokens, pre_tokens):
        salida = []
        i = 0
        for t in tokens:
            if pre_tokens[i] != 'O':
                a = out_json(t.replace('##', '').replace('Ġ', '').replace('Ċ', ''), pre_tokens[i].replace('▁', ''))
                salida.append(a)
            i = i + 1
        return json.dumps(salida, cls=MyEncoder, ensure_ascii=False)

    def tokens_identificados(self, tokens, pre_tokens):
        salida = []
        i = 0
        for t in tokens:
            if pre_tokens[i] != 'O':
                salida.append(t.replace('##', '').replace('Ġ', '').replace('Ċ', ''))
            i = i + 1
        return salida
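    ###
    ### metricas_anonimizacion compares the originally identified tokens with their fake
    ### replacements and reports the number of coincidences as "matches/total".
    ###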
    def metricas_anonimizacion(self, _f, t, id):
        i = 0
        coincidencia = 0
        Z = ['O']
        _fake_filter = [x for x in _f if x not in Z]
        new_tokens_filter = self.tokens_identificados(t, id)
        for token in new_tokens_filter:
            if token == _fake_filter[i]:
                coincidencia = coincidencia + 1
            i = i + 1
        return str(coincidencia) + "/" + str(len(_fake_filter))
    def salida_texto(self, tokens, pre_tokens):
        new_labels = []
        i = 0
        for token in tokens:
            if pre_tokens[i] == 'O' or 'MISC' in pre_tokens[i]:
                new_labels.append(' ' + token.replace('##', '').replace('Ġ', ''))
            else:
                new_labels.append(' ' + pre_tokens[i])
            i = i + 1
        a = ''
        for i in new_labels:
            a = a + i
        return a

    def salida_texto_anonimizado(self, ids, pre_tokens):
        new_labels = []
        i = 0
        for identificador in pre_tokens:
            if identificador == 'O' or 'OTH' in identificador:
                new_labels.append(self.tokenizer.decode(ids[i]))
            else:
                new_labels.append(' ' + identificador)
            i = i + 1
        a = ''
        for i in new_labels:
            a = a + i
        return a

    def is_integer_string(self, value):
        try:
            int(value)
            return True
        except ValueError:
            return False

    def formato_salida(self, out):
        a = ""
        for i in out:
            a = a + i.replace('▁', '').replace(' ', '') + ' '
        return a
    def fake_pers(self):
        return self.faker_.name()

    def fake_word(self):
        return self.faker_.word()

    def fake_first_name(self):
        return self.faker_.first_name()

    def fake_last_name(self):
        return self.faker_.last_name()

    def fake_address(self):
        return self.faker_.address()

    def fake_sentence(self, n):
        return self.faker_.sentence(nb_words=n)

    def fake_text(self):
        return self.faker_.text()

    def fake_company(self):
        return self.faker_.company()

    def fake_city(self):
        return self.faker_.city()

    def get_day_of(self, month_name, year=2024):
        months = {
            'enero': 1, 'febrero': 2, 'marzo': 3, 'abril': 4, 'mayo': 5, 'junio': 6,
            'julio': 7, 'agosto': 8, 'septiembre': 9, 'octubre': 10, 'noviembre': 11, 'diciembre': 12,
            'january': 1, 'february': 2, 'march': 3, 'april': 4, 'may': 5, 'june': 6,
            'july': 7, 'august': 8, 'september': 9, 'october': 10, 'november': 11, 'december': 12
        }
        month = months[month_name]
        _, num_days = calendar.monthrange(year, month)
        return str(num_days)
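    ###
    ### reemplazo_fake replaces each identified entity with fake data of the same kind
    ### (names for PER, companies for ORG, cities for LOC, dates/months/days for DATE);
    ### any other label is passed through unchanged.
    ###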
    def reemplazo_fake(self, identificadores, new_tokens):
        a = ['Enero', 'January', 'February', 'Febrero', 'Marzo', 'March', 'Abril', 'April', 'Mayo', 'May', 'Junio', 'June',
             'Julio', 'July', 'Agosto', 'August', 'Septiembre', 'September', 'Octubre', 'October', 'Noviembre', 'November',
             'Diciembre', 'December']
        b = ['Ene', 'Jan', 'Feb', 'Mar', 'Mar', 'Abr', 'Apr', 'May', 'May', 'Jun', 'Jun', 'Jul', 'Jul', 'Ago', 'Aug',
             'Sep', 'Oct', 'Nov', 'Dic', 'Dec']
        i = 0
        new_iden = []
        for id in identificadores:
            if 'PER' in id:
                new_iden.append(self.fake_first_name())
            elif 'ORG' in id:
                new_iden.append(self.fake_company())
            elif 'LOC' in id:
                new_iden.append(self.fake_city())
            elif 'DATE' in id:
                if self.is_integer_string(new_tokens[i]):
                    match len(new_tokens[i]):
                        case 4:
                            new_iden.append(self.faker_.date()[:4])
                        case 10:
                            new_iden.append(self.faker_.date())
                        case 1:
                            new_iden.append(self.get_day_of('february'))
                        case 2:
                            new_iden.append(self.get_day_of('february'))
                        case _:
                            new_iden.append(id)
                else:
                    match new_tokens[i]:
                        case w if w in a:
                            new_iden.append(self.faker_.month_name())
                        case w if w in b:
                            new_iden.append(self.faker_.month_name()[:3])
                        case "-":
                            new_iden.append("-")
                        case ".":
                            new_iden.append(".")
                        case ",":
                            new_iden.append(",")
                        case "/":
                            new_iden.append("/")
                        case _:
                            new_iden.append(id)
            else:
                new_iden.append(id)
            i = i + 1
        return new_iden
    ###
    ### Function that applies the model to categorize the text according to its context
    ###
    def categorizar_texto(self, texto):
        name = "elozano/bert-base-cased-news-category"
        tokenizer = AutoTokenizer.from_pretrained(name)
        model_ = AutoModelForSequenceClassification.from_pretrained(name)
        inputs_ = tokenizer(texto, padding=True, truncation=True, return_tensors="pt")
        with torch.no_grad():
            logits = model_(**inputs_).logits
        preds = torch.softmax(logits, dim=-1)
        id2lang = model_.config.id2label
        vals, idxs = torch.max(preds, dim=1)
        # keep the category with the highest score
        maximo = vals.max()
        cat = ''
        self.categoria_texto = ''
        porcentaje = 0
        for k, v in zip(idxs, vals):
            if v.item() == maximo:
                cat, porcentaje = id2lang[k.item()], v.item()
                self.categoria_texto = cat
        return cat, porcentaje
    ###
    ### Function that applies the models to a text
    ###
    def predict(self, etiquetas):
        categoria, porcentaje = self.categorizar_texto(self.texto)
        print(categoria, porcentaje)
        self.tokenizer = AutoTokenizer.from_pretrained(self.modelo_ner)
        if self.idioma == 'es':
            inputs = self.tokenizer(self.texto, return_tensors="pt", max_length=512, truncation=True)
            with torch.no_grad():
                outputs = self.model(**inputs)
                logits = outputs.logits
            predictions = torch.argmax(logits, dim=2)
            predicted_token_class_ids = predictions[0].tolist()
            predicted_tokens_classes = [self.model.config.id2label[label_id] for label_id in predicted_token_class_ids]
            tokens = self.tokenizer.convert_ids_to_tokens(inputs.input_ids[0])
            # drop the special tokens and their predictions
            predicted_tokens_classes.pop(0)
            predicted_tokens_classes.pop(len(predicted_tokens_classes) - 1)
            tokens.pop(0)
            tokens.pop(len(tokens) - 1)
            new_tokens, ig_tokens = self.reordenacion_tokens_es(tokens, 'Ġ')
        else:
            inputs = self.tokenizer(self.texto, return_tensors="pt")
            with torch.no_grad():
                outputs = self.model(**inputs)
                logits = outputs.logits
            predictions = torch.argmax(logits, dim=2)
            predicted_token_class_ids = predictions[0].tolist()
            predicted_tokens_classes = [self.model.config.id2label[label_id] for label_id in predicted_token_class_ids]
            tokens = self.tokenizer.convert_ids_to_tokens(inputs.input_ids[0])
            # drop the special tokens and their predictions
            predicted_tokens_classes.pop(0)
            predicted_tokens_classes.pop(len(predicted_tokens_classes) - 1)
            tokens.pop(0)
            tokens.pop(len(tokens) - 1)
            new_tokens, ig_tokens = self.reordenacion_tokens(tokens, '#')
        new_identificadores = self.reordenacion_identificadores(ig_tokens, predicted_tokens_classes)
        out1 = self.salida_json(new_tokens, new_identificadores)
        if etiquetas:
            out2 = self.salida_texto(new_tokens, new_identificadores)  # labels only
            out3 = ""
            coincidencia = ""
        else:
            _fake = self.reemplazo_fake(new_identificadores, new_tokens)
            coincidencia = self.metricas_anonimizacion(_fake, new_tokens, new_identificadores)
            out2 = self.salida_texto(new_tokens, _fake)
            out3 = self.salida_json(_fake, new_identificadores)
        return (
            out1,
            str(out2),
            out3,
            coincidencia
        )
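###
### ModeloDataset: batch counterpart of Model, used for CSV/JSON files. It applies the
### NER models column by column over lists of sentences.
###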
class ModeloDataset:
    def __init__(self):
        self.texto = ""
        self.idioma = ""
        self.modelo_ner = ""
        self.categoria_texto = ""

    def reordenacion_tokens(self, tokens, caracter):
        i = 0
        new_tokens = []
        ig_tokens = []
        for token in tokens:
            ind = len(new_tokens)
            if i < len(tokens):
                if not token.startswith(caracter):
                    new_tokens.append(token)
                    i = i + 1
                else:
                    new_tokens[ind - 1] = new_tokens[ind - 1] + token.replace(caracter, '')
                    ig_tokens.append(i)
                    i = i + 1
        return (
            new_tokens,
            ig_tokens
        )

    def reordenacion_tokens_es(self, tokens, caracter):
        i = 0
        new_tokens = []
        ig_tokens = []  # indices to ignore in the identifier array
        for token in tokens:
            ind = len(new_tokens)
            if i < len(tokens):
                if token.startswith(caracter):
                    new_tokens.append(token)
                    i = i + 1
                else:
                    new_tokens[ind - 1] = new_tokens[ind - 1] + token.replace(caracter, '')
                    ig_tokens.append(i)
                    i = i + 1
        return (
            new_tokens,
            ig_tokens
        )

    def reordenacion_identificadores(self, ig_tokens, predicted_tokens_classes, tamano):
        x = 0
        new_identificadores = []
        for token in predicted_tokens_classes:
            if x not in ig_tokens:
                if len(new_identificadores) < tamano:
                    new_identificadores.append(token)
            x = x + 1
        return new_identificadores

    def is_integer_string(self, value):
        try:
            int(value)
            return True
        except ValueError:
            return False

    def get_day_of(self, month_name, year=2024):
        months = {
            'enero': 1, 'febrero': 2, 'marzo': 3, 'abril': 4, 'mayo': 5, 'junio': 6,
            'julio': 7, 'agosto': 8, 'septiembre': 9, 'octubre': 10, 'noviembre': 11, 'diciembre': 12,
            'january': 1, 'february': 2, 'march': 3, 'april': 4, 'may': 5, 'june': 6,
            'july': 7, 'august': 8, 'september': 9, 'october': 10, 'november': 11, 'december': 12
        }
        month = months[month_name]
        _, num_days = calendar.monthrange(year, month)
        return str(num_days)

    ###
    ### Functions that generate different kinds of fake data depending on the category
    ###
    def fake_pers(self):
        return self.faker_.name()

    def fake_word(self):
        return self.faker_.word()

    def fake_first_name(self):
        return self.faker_.first_name()

    def fake_last_name(self):
        return self.faker_.last_name()

    def fake_address(self):
        return self.faker_.address()

    def fake_sentence(self, n):
        return self.faker_.sentence(nb_words=n)

    def fake_text(self):
        return self.faker_.text()

    def fake_company(self):
        return self.faker_.company()

    def fake_city(self):
        return self.faker_.city()
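    ###
    ### Same fake-replacement logic as Model.reemplazo_fake, but the Faker locale is
    ### selected here from the detected language.
    ###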
    def reemplazo_fake(self, identificadores, new_tokens):
        a = ['Enero', 'January', 'February', 'Febrero', 'Marzo', 'March', 'Abril', 'April', 'Mayo', 'May', 'Junio', 'June',
             'Julio', 'July', 'Agosto', 'August', 'Septiembre', 'September', 'Octubre', 'October', 'Noviembre', 'November',
             'Diciembre', 'December']
        b = ['Ene', 'Jan', 'Feb', 'Mar', 'Mar', 'Abr', 'Apr', 'May', 'May', 'Jun', 'Jun', 'Jul', 'Jul', 'Ago', 'Aug',
             'Sep', 'Oct', 'Nov', 'Dic', 'Dec']
        i = 0
        if self.idioma == 'es':
            self.faker_ = Faker('es_MX')
        else:
            self.faker_ = Faker('en_US')
        new_iden = []
        for id in identificadores:
            if 'PER' in id:
                new_iden.append(self.fake_first_name())
            elif 'ORG' in id:
                new_iden.append(self.fake_company())
            elif 'LOC' in id:
                new_iden.append(self.fake_city())
            elif 'DATE' in id:
                if self.is_integer_string(new_tokens[i]):
                    match len(new_tokens[i]):
                        case 4:
                            new_iden.append(self.faker_.date()[:4])
                        case 10:
                            new_iden.append(self.faker_.date())
                        case 1:
                            new_iden.append(self.get_day_of('february'))
                        case 2:
                            new_iden.append(self.get_day_of('february'))
                        case _:
                            new_iden.append(id)
                else:
                    match new_tokens[i]:
                        case w if w in a:
                            new_iden.append(self.faker_.month_name())
                        case w if w in b:
                            new_iden.append(self.faker_.month_name()[:3])
                        case "-":
                            new_iden.append("-")
                        case ".":
                            new_iden.append(".")
                        case ",":
                            new_iden.append(",")
                        case "/":
                            new_iden.append("/")
                        case _:
                            new_iden.append(id)
            else:
                new_iden.append(id)
            i = i + 1
        return new_iden
    ###
    ### Function that applies the models according to the detected language
    ###
    def aplicar_modelo(self, _sentences, idioma, etiquetas):
        if idioma == "es":
            self.tokenizer = AutoTokenizer.from_pretrained("BSC-LT/roberta_model_for_anonimization")
            tokenized_text = [self.tokenizer.tokenize(sentence[:500]) for sentence in _sentences]
            ids = [self.tokenizer.convert_tokens_to_ids(x) for x in tokenized_text]
            MAX_LEN = 128
            ids = pad_sequences(ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")
            input_ids = torch.tensor(ids)
            self.model = RobertaForTokenClassification.from_pretrained("BSC-LT/roberta_model_for_anonimization")
            with torch.no_grad():
                logits = self.model(input_ids).logits
            predicted_token_class_ids = logits.argmax(-1)
            i = 0
            _predicted_tokens_classes = []
            for a in predicted_token_class_ids:
                _predicted_tokens_classes.append([self.model.config.id2label[t.item()] for t in predicted_token_class_ids[i]])
                i = i + 1
            labels = predicted_token_class_ids
            loss = self.model(input_ids, labels=labels).loss
            new_tokens = []
            ig_tok = []
            i = 0
            new_identificadores = []
            for item in tokenized_text:
                aux1, aux2 = self.reordenacion_tokens_es(item, "Ġ")
                new_tokens.append(aux1)
                ig_tok.append(aux2)
            for items in _predicted_tokens_classes:
                aux = self.reordenacion_identificadores(ig_tok[i], items, len(new_tokens[i]))
                new_identificadores.append(aux)
                i = i + 1
            return new_identificadores, new_tokens
        else:
            print('idioma:', idioma)
            self.tokenizer = AutoTokenizer.from_pretrained("dayannex/distilbert-tuned-4labels")
            self.model = AutoModelForTokenClassification.from_pretrained("dayannex/distilbert-tuned-4labels")
            sentences_list = _sentences.tolist()
            inputs = self.tokenizer(sentences_list, padding=True, truncation=True, return_tensors="pt", max_length=512)
            with torch.no_grad():
                outputs = self.model(**inputs)
                logits = outputs.logits
            predictions = torch.argmax(logits, dim=2)
            id2label = self.model.config.id2label
            all_tokens = []
            all_label_ids = []
            all_labels = []
            for i, sentence in enumerate(sentences_list):
                tokens = self.tokenizer.convert_ids_to_tokens(inputs.input_ids[i])
                label_ids = predictions[i].tolist()
                labels = [id2label[label_id] for label_id in label_ids]
                all_tokens.append(tokens)
                all_label_ids.append(label_ids)
                all_labels.append(labels)
            # remove the first and last element (special tokens)
            for item in all_tokens:
                item.pop(0)
                item.pop(len(item) - 1)
            for item in all_labels:
                item.pop(0)
                item.pop(len(item) - 1)
            new_tokens = []
            ig_tok = []
            i = 0
            new_identificadores = []
            for item in all_tokens:
                aux1, aux2 = self.reordenacion_tokens(item, "#")
                new_tokens.append(aux1)
                ig_tok.append(aux2)
            print('ig_tok')
            print(ig_tok)
            i = 0
            for items in all_labels:
                aux = self.reordenacion_identificadores(ig_tok[i], items, len(new_tokens[i]))
                new_identificadores.append(aux)
                i = i + 1
            special_tokens = self.tokenizer.all_special_tokens
            filtered_tokens = []
            filtered_labels = []
            tok_new = []
            lab_new = []
            # discard the special tokens
            for token_linea, label_linea in zip(new_tokens, new_identificadores):
                filtered_tokens = []
                filtered_labels = []
                for token, label in zip(token_linea, label_linea):
                    if token not in special_tokens:
                        filtered_tokens.append(token)
                        filtered_labels.append(label)
                tok_new.append(filtered_tokens)
                lab_new.append(filtered_labels)
            return lab_new, tok_new
    ###
    ### Combines the tokens generated from the input text with the predicted tokens to
    ### produce word-level output
    ###
    def salida_texto(self, tokens, pre_tokens):
        new_labels = []
        i = 0
        for token in tokens:
            if pre_tokens[i] == 'O' or 'MISC' in pre_tokens[i]:
                new_labels.append(' ' + token.replace('▁', '').replace('Ġ', ''))
            else:
                new_labels.append(' ' + pre_tokens[i])
            i = i + 1
        a = ''
        for i in new_labels:
            a = a + i
        return a

    def salida_texto2(self, tokens, labels, etiquetas):
        i = 0
        out = []
        for iden in labels:
            if etiquetas:
                out.append(self.salida_texto(iden, np.array(tokens[i])))
            else:
                out.append(self.salida_texto(iden, self.reemplazo_fake(np.array(tokens[i]), labels[i])))
            i = i + 1
        return out

    def unir_array(self, _out):
        i = 0
        salida = []
        for item in _out:
            salida.append("".join(str(x) for x in _out[i]))
            i = i + 1
        return salida

    def unir_columna_valores(self, df, columna):
        out = ','.join(df[columna])
        return out
###
### Class for processing JSON files; receives the file path
###
class utilJSON:
    def __init__(self, archivo):
        with open(archivo, encoding='utf-8') as f:
            self.data = json.load(f)

    def obtener_keys_json(self, data):
        out = []
        for key in data:
            out.append(key)
        return out

    ###
    ### "flatten_json" function taken from
    ### https://levelup.gitconnected.com/a-deep-dive-into-nested-json-to-data-frame-with-python-69bdabb41938
    ### (Renu Khandelwal, Jul 23, 2023)
    ###
    def flatten_json(self, y):
        try:
            out = {}

            def flatten(x, name=''):
                if type(x) is dict:
                    for a in x:
                        flatten(x[a], name + a + '_')
                elif type(x) is list:
                    i = 0
                    for a in x:
                        flatten(a, name + str(i) + '_')
                        i += 1
                else:
                    out[name[:-1]] = x

            flatten(y)
            return out
        except json.JSONDecodeError:
            print("Error: The JSON document could not be decoded.")
        except TypeError:
            print("Error: Invalid operation or function argument type.")
        except KeyError:
            print("Error: One or more keys do not exist.")
        except ValueError:
            print("Error: Invalid value detected.")
        except Exception as e:
            print(f"An unexpected error occurred: {str(e)}")

    def obtener_dataframe(self, data):
        claves = self.obtener_keys_json(data)
        if len(claves) == 1:
            data_flattened = [self.flatten_json(class_info) for class_info in data[claves[0]]]
            df = pd.DataFrame(data_flattened)
        else:
            data_flattened = [self.flatten_json(class_info) for class_info in data]
            df = pd.DataFrame(data_flattened)
        return df
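###
### Module-level instances shared by the Gradio callback: `modelo` processes the
### CSV/JSON files and `model` processes free text.
###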
modelo = ModeloDataset()
model = Model()


def get_model():
    return model


###
### Function that interacts with the Gradio interface to process text, CSV, or JSON
###
def procesar(texto, archivo, etiquetas):
    if len(texto) > 0:
        print('text')
        model.identificacion_idioma(texto[:1700])
        labels, textoProcesado, labels_fake, coincidencia = model.predict(etiquetas)
        return model.idioma + "/" + model.categoria_texto, labels, textoProcesado, gr.Dataframe(), gr.File(), labels_fake, coincidencia
    else:
        if archivo.name.split(".")[1] == "csv":
            print('csv')
            df = pd.read_csv(archivo.name, delimiter=";")
            df_new = pd.DataFrame(columns=df.columns.values)
            model.identificacion_idioma(df.iloc[0][0])
            modelo.idioma = model.idioma
            print(model.idioma)
            for item in df.columns.values:
                sentences = df[item]
                ides, predicted = modelo.aplicar_modelo(sentences, model.idioma, etiquetas)
                out = modelo.salida_texto2(ides, predicted, etiquetas)
                print('out csv:', out)
                df_new[item] = modelo.unir_array(out)
            return modelo.idioma, "", "", df_new, df_new.to_csv(sep='\t', encoding='utf-8', index=False), "", ""
        else:
            print('json')
            if archivo.name.split(".")[1] == "json":
                util = utilJSON(archivo.name)
                df = util.obtener_dataframe(util.data)
                df_new = pd.DataFrame(columns=df.columns.values)
                model.identificacion_idioma(df.iloc[0][0])
                modelo.idioma = model.idioma
                for item in df.columns.values:
                    sentences = df[item]
                    ides, predicted = modelo.aplicar_modelo(sentences, modelo.idioma, etiquetas)
                    out = modelo.salida_texto2(ides, predicted, etiquetas)
                    print('out json:', out)
                    df_new[item] = modelo.unir_array(out)
                return modelo.idioma, "", "", df_new, df_new.to_csv(sep='\t', encoding='utf-8', index=False), "", ""


demo = gr.Interface(
    fn=procesar,
    inputs=["text", gr.File(), "checkbox"],
    outputs=[
        gr.Label(label="idioma/categoría"),
        gr.Textbox(label="etiquetas"),
        gr.Textbox(label="texto procesado"),
        gr.Dataframe(label="Datos procesados en dataframe", interactive=False),
        gr.Textbox(label="datos csv"),
        gr.Textbox(label="etiquetas anonimizadas"),
        gr.Label(label="coincidencia tokens originales vs anonimizados"),
    ],
)

demo.launch(share=True)