import traceback
import logging
from typing import List, Any, Optional, Tuple

from obsei.payload import TextPayload
from obsei.preprocessor.base_preprocessor import (
    BaseTextPreprocessor,
    BaseTextProcessorConfig,
)
from obsei.preprocessor.text_cleaning_function import (
    TextCleaningFunction,
    ToLowerCase,
    RemoveWhiteSpaceAndEmptyToken,
    RemovePunctuation,
    RemoveSpecialChars,
    DecodeUnicode,
    RemoveDateTime,
    ReplaceDomainKeywords,
    TokenStemming,
    RemoveStopWords,
)
from obsei.preprocessor.text_tokenizer import BaseTextTokenizer, NLTKTextTokenizer

cleaner_logger: logging.Logger = logging.getLogger(__name__)


class TextCleanerConfig(BaseTextProcessorConfig):
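    """Configuration for TextCleaner.

    When ``cleaning_functions`` is not supplied, a default cleaning
    pipeline is assembled in ``__init__`` below.
    """
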
    cleaning_functions: Optional[List[TextCleaningFunction]] = None
    stop_words_language: Optional[str] = "english"
    stop_words: Optional[List[str]] = None
    domain_keywords: Optional[Tuple[str, str]] = None
    disable_tokenization: bool = False

    def __init__(self, **data: Any):
        super().__init__(**data)

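        # Fall back to the default cleaning pipeline when the caller does
        # not provide one explicitly.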
        if not self.cleaning_functions:
            self.cleaning_functions = [
                ToLowerCase(),
                RemoveWhiteSpaceAndEmptyToken(),
                RemovePunctuation(),
                RemoveSpecialChars(),
                DecodeUnicode(),
                RemoveDateTime(),
                ReplaceDomainKeywords(domain_keywords=self.domain_keywords),
                TokenStemming(),
                RemoveStopWords(
                    language=self.stop_words_language, stop_words=self.stop_words
                ),
                RemoveWhiteSpaceAndEmptyToken(),
            ]


class TextCleaner(BaseTextPreprocessor):
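    """Tokenizes each payload's text and applies the configured cleaning functions."""
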
    text_tokenizer: Optional[BaseTextTokenizer] = None

    def __init__(self, **data: Any):
        super().__init__(**data)
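        # Default to the NLTK-based tokenizer when none is injected.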
        self.text_tokenizer = self.text_tokenizer or NLTKTextTokenizer()

    def preprocess_input(  # type: ignore[override]
        self,
        input_list: List[TextPayload],
        config: TextCleanerConfig,
        **kwargs: Any,
    ) -> List[TextPayload]:
        if config.cleaning_functions is None:
            return input_list

        for input_data in input_list:
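            # Treat the whole text as a single token when tokenization is
            # disabled or no tokenizer is available; otherwise clean the
            # text token by token.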
            if self.text_tokenizer is None or config.disable_tokenization:
                tokens = [input_data.processed_text]
            else:
                tokens = self.text_tokenizer.tokenize_text(
                    input_data.processed_text
                )
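
            # Apply the cleaning functions in order; a failure in one
            # function is logged but does not abort the pipeline.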
            for cleaning_function in config.cleaning_functions:
                try:
                    tokens = cleaning_function.execute(tokens)
                except Exception as ex:
                    cleaner_logger.warning(f"Received exception: {ex}")
                    traceback.print_exc()
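
            # Re-join the cleaned tokens into a single space-separated string.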
            input_data.processed_text = " ".join(tokens)

        return input_list
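

# --- Usage sketch (illustrative; not part of the original module) ---
# A minimal example of running the cleaner end to end, assuming the NLTK
# data required by NLTKTextTokenizer and RemoveStopWords (punkt, stopwords)
# is installed locally. The sample text is made up for demonstration.
if __name__ == "__main__":
    cleaner = TextCleaner()
    config = TextCleanerConfig(stop_words_language="english")
    payloads = cleaner.preprocess_input(
        input_list=[TextPayload(processed_text="Hello,   World! It's a DEMO.")],
        config=config,
    )
    # Tokens come back lower-cased, stripped of punctuation and stop words,
    # stemmed, and re-joined with single spaces.
    print(payloads[0].processed_text)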