dataset model csv ingles
app.py
CHANGED
@@ -477,56 +477,67 @@ class ModeloDataset:
print('idioma:',idioma)
self.tokenizer = AutoTokenizer.from_pretrained("dayannex/distilbert-tuned-4labels")
self.model = AutoModelForTokenClassification.from_pretrained("dayannex/distilbert-tuned-4labels")
-
- inputs=
-
- #ids = [self.tokenizer.convert_tokens_to_ids(x) for x in tokenized_text]
-

- #MAX_LEN=128
- #ids=pad_sequences(ids,maxlen=MAX_LEN,dtype="long",truncating="post", padding="post")
- #input_ids = torch.tensor(ids)
-
with torch.no_grad():
-     outputs =
-
-

-
-
-
-
-
-
-
-
-
- #
-
-
-
-
-
-
-
- #labels = predicted_token_class_ids
- #loss = self.model(input_ids, labels=labels).loss

new_tokens=[]
ig_tok=[]
i=0
new_identificadores=[]
- for item in
-
-
-
-
-
-
-
-
-
-


return new_identificadores, new_tokens
print('idioma:',idioma)
self.tokenizer = AutoTokenizer.from_pretrained("dayannex/distilbert-tuned-4labels")
self.model = AutoModelForTokenClassification.from_pretrained("dayannex/distilbert-tuned-4labels")
+
+ inputs = self.tokenizer(_sentences, padding=True, truncation=True, return_tensors="pt", max_length=512)
+ #model.eval()

with torch.no_grad():
+     outputs = model(**inputs)
+
+
+ logits = outputs.logits
+ predictions = torch.argmax(logits, dim=2)
+
+ id2label = model.config.id2label
+
+
+ all_tokens = []
+ all_label_ids = []
+ all_labels = []
+ for i, sentence in enumerate(_sentences):

+     tokens = self.tokenizer.convert_ids_to_tokens(inputs.input_ids[i])
+     label_ids = predictions[i].tolist()
+     labels = [id2label[label_id] for label_id in label_ids]
+
+
+     all_tokens.append(tokens)
+     all_label_ids.append(label_ids)
+     all_labels.append(labels)
+
+ # the first and last element of each sequence are removed
+ for item in all_tokens:
+     item.pop(0)
+     item.pop(len(item)-1)
+ for item in all_labels:
+     item.pop(0)
+     item.pop(len(item)-1)
+

new_tokens=[]
ig_tok=[]
i=0
new_identificadores=[]
+ for item in all_tokens:
+     aux1, aux2= self.reordenacion_tokens(item,"#")
+     new_tokens.append(aux1)
+     ig_tok.append(aux2)
+
+ print('ig_tok')
+ print(ig_tok)
+ #print('all_tokens')
+ #print(all_tokens)
+ i=0
+ for items in all_labels:
+     aux=self.reordenacion_identificadores(ig_tok[i],items,len(new_tokens[i]))
+     new_identificadores.append(aux)
+     i=i+1
+ print('new_tokens')
+ print(new_tokens[1])
+ print(all_tokens[1])
+
+ print(len(new_tokens[1]))
+ print(len(new_identificadores[1]))


return new_identificadores, new_tokens
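For context, a minimal, self-contained sketch of the inference flow the added lines implement: batch-encode the sentences, take the argmax over the token-classification logits, map label ids through model.config.id2label, drop the special tokens, and merge WordPiece pieces back into words. The example sentences and the merge_wordpieces() helper are illustrative assumptions; in app.py the merging is done by self.reordenacion_tokens / self.reordenacion_identificadores, whose bodies are outside this hunk, and the checkpoint name is taken from the diff.

import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification

tokenizer = AutoTokenizer.from_pretrained("dayannex/distilbert-tuned-4labels")
model = AutoModelForTokenClassification.from_pretrained("dayannex/distilbert-tuned-4labels")
model.eval()

# Example input; stands in for the _sentences list used in app.py.
sentences = ["Juan vive en Bogota.", "Maria trabaja en Madrid."]

# Batch-encode and run a single forward pass without gradients.
inputs = tokenizer(sentences, padding=True, truncation=True, return_tensors="pt", max_length=512)
with torch.no_grad():
    logits = model(**inputs).logits
predictions = torch.argmax(logits, dim=2)
id2label = model.config.id2label

def merge_wordpieces(tokens, labels):
    # Hypothetical stand-in for reordenacion_tokens / reordenacion_identificadores:
    # glue '##' continuation pieces onto the previous token, keep the first piece's label.
    merged_tokens, merged_labels = [], []
    for tok, lab in zip(tokens, labels):
        if tok.startswith("##") and merged_tokens:
            merged_tokens[-1] += tok[2:]
        else:
            merged_tokens.append(tok)
            merged_labels.append(lab)
    return merged_tokens, merged_labels

for i, sentence in enumerate(sentences):
    tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][i].tolist())
    labels = [id2label[p] for p in predictions[i].tolist()]
    # Drop [CLS], [SEP] and any [PAD] positions.
    keep = [tok not in tokenizer.all_special_tokens for tok in tokens]
    tokens = [t for t, k in zip(tokens, keep) if k]
    labels = [l for l, k in zip(labels, keep) if k]
    tokens, labels = merge_wordpieces(tokens, labels)
    print(sentence, list(zip(tokens, labels)))

One design note on the sketch: filtering on tokenizer.all_special_tokens also removes [PAD] positions, whereas the pop(0) / pop(len(item)-1) approach in the committed code only strips the first and last position of each padded sequence.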