dayannex committed
Commit 8f4afb4 · 1 Parent(s): e66b5e6

dataset model csv ingles

Files changed (1)
  1. app.py +53 -42
app.py CHANGED
@@ -477,56 +477,67 @@ class ModeloDataset:
    print('idioma:',idioma)
    self.tokenizer = AutoTokenizer.from_pretrained("dayannex/distilbert-tuned-4labels")
    self.model = AutoModelForTokenClassification.from_pretrained("dayannex/distilbert-tuned-4labels")
-   #tokenized_text=[self.tokenizer.tokenize(sentence[:500]) for sentence in _sentences]
-   inputs=[self.tokenizer(sentence[:500], return_tensors="pt") for sentence in _sentences]
-   print('inputs',inputs)
-   #ids = [self.tokenizer.convert_tokens_to_ids(x) for x in tokenized_text]
-
-   #MAX_LEN=128
-   #ids=pad_sequences(ids,maxlen=MAX_LEN,dtype="long",truncating="post", padding="post")
-   #input_ids = torch.tensor(ids)
-
    with torch.no_grad():
-       outputs = self.model(inputs)
-       logits = outputs.logits
-       predicted_token_class_ids = torch.argmax(logits, dim=2)
-
-   #predicted_token_class_ids = predicted_token_class_ids[0].tolist()
-   i=0
-   _predicted_tokens_classes=[]
-   for a in predicted_token_class_ids:
-       _predicted_tokens_classes.append( [self.model.config.id2label[label_id] for label_id in predicted_token_class_ids[i]])
-       i=i+1
-   print('_predicted_tokens_classes:',_predicted_tokens_classes[0])
-   #with torch.no_grad():
-   #    logits = self.model(input_ids).logits
-   #predicted_token_class_ids = logits.argmax(-1)
-   #i=0
-   #_predicted_tokens_classes=[]
-   #for a in predicted_token_class_ids:
-   #    _predicted_tokens_classes.append([self.model.config.id2label[t.item()] for t in predicted_token_class_ids[i]])
-   #    i=i+1
-   #labels = predicted_token_class_ids
-   #loss = self.model(input_ids, labels=labels).loss

    new_tokens=[]
    ig_tok=[]
    i=0
    new_identificadores=[]
-   for item in tokenized_text:
-       aux1, aux2= self.reordenacion_tokens(item,"#")
-       new_tokens.append(aux1)
-       ig_tok.append(aux2)
-
-   for items in _predicted_tokens_classes:
-       aux=self.reordenacion_identificadores(ig_tok[i],items,len(new_tokens[i]))
-       new_identificadores.append(aux)
-       i=i+1

    return new_identificadores, new_tokens
 
    print('idioma:',idioma)
    self.tokenizer = AutoTokenizer.from_pretrained("dayannex/distilbert-tuned-4labels")
    self.model = AutoModelForTokenClassification.from_pretrained("dayannex/distilbert-tuned-4labels")
+
+   inputs = self.tokenizer(_sentences, padding=True, truncation=True, return_tensors="pt", max_length=512)
+   #model.eval()
    with torch.no_grad():
+       outputs = self.model(**inputs)
+
+   logits = outputs.logits
+   predictions = torch.argmax(logits, dim=2)
+
+   id2label = self.model.config.id2label
+
+   all_tokens = []
+   all_label_ids = []
+   all_labels = []
+   for i, sentence in enumerate(_sentences):
+       tokens = self.tokenizer.convert_ids_to_tokens(inputs.input_ids[i])
+       label_ids = predictions[i].tolist()
+       labels = [id2label[label_id] for label_id in label_ids]
+
+       all_tokens.append(tokens)
+       all_label_ids.append(label_ids)
+       all_labels.append(labels)
+
+   # remove the first and last element
+   for item in all_tokens:
+       item.pop(0)
+       item.pop(len(item)-1)
+   for item in all_labels:
+       item.pop(0)
+       item.pop(len(item)-1)

    new_tokens=[]
    ig_tok=[]
    i=0
    new_identificadores=[]
+   for item in all_tokens:
+       aux1, aux2= self.reordenacion_tokens(item,"#")
+       new_tokens.append(aux1)
+       ig_tok.append(aux2)
+
+   print('ig_tok')
+   print(ig_tok)
+   #print('all_tokens')
+   #print(all_tokens)
+   i=0
+   for items in all_labels:
+       aux=self.reordenacion_identificadores(ig_tok[i],items,len(new_tokens[i]))
+       new_identificadores.append(aux)
+       i=i+1
+   print('new_tokens')
+   print(new_tokens[1])
+   print(all_tokens[1])
+
+   print(len(new_tokens[1]))
+   print(len(new_identificadores[1]))

    return new_identificadores, new_tokens
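
For readers tracing the change: the new code tokenizes all sentences in one padded, truncated batch and maps the argmax of the logits back to label names. Below is a minimal, self-contained sketch of that flow, assuming only torch and transformers plus the model id shown in the diff; the example sentences are invented, and the app's subword-regrouping helpers (reordenacion_tokens, reordenacion_identificadores) are not reproduced here — special and padding tokens are simply filtered out instead.

    import torch
    from transformers import AutoTokenizer, AutoModelForTokenClassification

    model_name = "dayannex/distilbert-tuned-4labels"   # model id taken from the diff above
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForTokenClassification.from_pretrained(model_name)
    model.eval()

    sentences = ["John Smith lives in Madrid.", "Call me on Monday."]   # hypothetical input
    inputs = tokenizer(sentences, padding=True, truncation=True, max_length=512, return_tensors="pt")

    with torch.no_grad():
        logits = model(**inputs).logits
    predictions = logits.argmax(dim=2)

    id2label = model.config.id2label
    for i, sentence in enumerate(sentences):
        tokens = tokenizer.convert_ids_to_tokens(inputs.input_ids[i].tolist())
        labels = [id2label[p] for p in predictions[i].tolist()]
        # drop [CLS]/[SEP]/[PAD] before pairing tokens with their predicted labels
        special = {tokenizer.cls_token, tokenizer.sep_token, tokenizer.pad_token}
        print(sentence, [(t, l) for t, l in zip(tokens, labels) if t not in special])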