# ai_hunter_enhanced.py # Combined AI Hunter configuration GUI and detection logic import json import os import re import unicodedata from difflib import SequenceMatcher from collections import Counter # PySide6 imports - optional for non-GUI usage try: from PySide6.QtWidgets import ( QDialog, QVBoxLayout, QHBoxLayout, QLabel, QPushButton, QCheckBox, QTabWidget, QWidget, QScrollArea, QFrame, QSlider, QSpinBox, QDoubleSpinBox, QRadioButton, QComboBox, QGroupBox, QMessageBox ) from PySide6.QtCore import Qt from PySide6.QtGui import QIcon HAS_GUI = True except ImportError: HAS_GUI = False # Dummy classes for non-GUI usage QDialog = object QWidget = object class AIHunterConfigGUI: """GUI for configuring AI Hunter detection parameters""" def __init__(self, parent, config_dict, callback=None): """ Initialize with reference to main config dictionary Args: parent: Parent window config_dict: Reference to main translator config dictionary callback: Function to call after saving """ self.parent = parent self.config = config_dict # Reference to main config self.callback = callback self.window = None # Default AI Hunter settings structure self.default_ai_hunter = { 'enabled': True, 'ai_hunter_max_workers': 1, 'retry_attempts': 6, 'disable_temperature_change': False, 'sample_size': 3000, 'thresholds': { 'exact': 90, 'text': 35, 'semantic': 85, 'structural': 85, 'character': 90, 'pattern': 80 }, 'weights': { 'exact': 1.5, 'text': 1.2, 'semantic': 1.0, 'structural': 1.0, 'character': 0.8, 'pattern': 0.8 }, 'detection_mode': 'weighted_average', 'multi_method_requirements': { 'methods_required': 3, 'min_methods': ['semantic', 'structural'] }, 'preprocessing': { 'remove_html_spacing': True, 'normalize_unicode': True, 'ignore_case': True, 'remove_extra_whitespace': True }, 'edge_filters': { 'min_text_length': 500, 'max_length_ratio': 1.3, 'min_length_ratio': 0.7 }, 'language_detection': { 'enabled': False, 'target_language': 'english', 'threshold_characters': 500, 'languages': { 'english': ['en'], 'japanese': ['ja', 'jp'], 'korean': ['ko', 'kr'], 'chinese': ['zh', 'zh-cn', 'zh-tw'], 'spanish': ['es'], 'french': ['fr'], 'german': ['de'], 'russian': ['ru'], 'arabic': ['ar'], 'hindi': ['hi'], 'portuguese': ['pt'], 'italian': ['it'], 'dutch': ['nl'], 'thai': ['th'], 'vietnamese': ['vi'], 'turkish': ['tr'], 'polish': ['pl'], 'swedish': ['sv'], 'danish': ['da'], 'norwegian': ['no'], 'finnish': ['fi'] } } } # Initialize AI Hunter config in main config if not present if 'ai_hunter_config' not in self.config: self.config['ai_hunter_config'] = self.default_ai_hunter.copy() else: # Merge with defaults to ensure all keys exist self.config['ai_hunter_config'] = self._merge_configs( self.default_ai_hunter, self.config['ai_hunter_config'] ) def _merge_configs(self, default, existing): """Recursively merge existing config with defaults""" result = default.copy() for key, value in existing.items(): if key in result and isinstance(result[key], dict) and isinstance(value, dict): result[key] = self._merge_configs(result[key], value) else: result[key] = value return result def get_ai_config(self): """Get AI Hunter configuration from main config""" return self.config.get('ai_hunter_config', self.default_ai_hunter) def _disable_mousewheel(self, widget): """Disable mousewheel scrolling on a widget (PySide6)""" widget.wheelEvent = lambda event: None def _create_styled_checkbox(self, text): """Create a checkbox with proper checkmark using text overlay""" from PySide6.QtCore import QTimer checkbox = QCheckBox(text) # Don't set inline stylesheet - use the global stylesheet from container # Create checkmark overlay checkmark = QLabel("✓", checkbox) checkmark.setStyleSheet(""" QLabel { color: white; background: transparent; font-weight: bold; font-size: 11px; } """) checkmark.setAlignment(Qt.AlignCenter) checkmark.hide() checkmark.setAttribute(Qt.WA_TransparentForMouseEvents) def position_checkmark(): try: # Check if checkmark still exists and is valid if checkmark and not checkmark.isHidden() or True: # Always try to set geometry checkmark.setGeometry(2, 1, 14, 14) except RuntimeError: # Widget was already deleted pass def update_checkmark(): try: # Check if both widgets still exist if checkbox and checkmark: if checkbox.isChecked(): position_checkmark() checkmark.show() else: checkmark.hide() except RuntimeError: # Widget was already deleted pass checkbox.stateChanged.connect(update_checkmark) # Use try-except to handle case where widgets are deleted before timer fires def safe_init(): try: position_checkmark() update_checkmark() except RuntimeError: pass QTimer.singleShot(0, safe_init) return checkbox def show_ai_hunter_config(self): """Display the AI Hunter configuration window (PySide6)""" try: if self.window and not self.window.isHidden(): self.window.raise_() self.window.activateWindow() return except RuntimeError: # Window was deleted self.window = None # Create dialog dialog = QDialog(None) dialog.setWindowTitle("AI Hunter Configuration") # Use screen ratios for sizing (more reliable across different displays) from PySide6.QtWidgets import QApplication screen = QApplication.primaryScreen().geometry() width = int(screen.width() * 0.47) # 47% of screen width height = int(screen.height() * 0.69) # 69% of screen height dialog.resize(width, height) # Set icon try: dialog.setWindowIcon(QIcon("halgakos.ico")) except Exception: pass self.window = dialog # Apply global stylesheet for checkboxes, radio buttons, and tabs checkbox_radio_style = """ QCheckBox { color: white; spacing: 6px; } QCheckBox::indicator { width: 14px; height: 14px; border: 1px solid #5a9fd4; border-radius: 2px; background-color: #2d2d2d; } QCheckBox::indicator:checked { background-color: #5a9fd4; border-color: #5a9fd4; } QCheckBox::indicator:hover { border-color: #7bb3e0; } QCheckBox:disabled { color: #666666; } QCheckBox::indicator:disabled { background-color: #1a1a1a; border-color: #3a3a3a; } QRadioButton { color: white; spacing: 5px; } QRadioButton::indicator { width: 13px; height: 13px; border: 2px solid #5a9fd4; border-radius: 7px; background-color: #2d2d2d; } QRadioButton::indicator:checked { background-color: #5a9fd4; border: 2px solid #5a9fd4; } QRadioButton::indicator:hover { border-color: #7bb3e0; } QRadioButton:disabled { color: #666666; } QRadioButton::indicator:disabled { background-color: #1a1a1a; border-color: #3a3a3a; } QTabWidget::pane { border: 1px solid #5a9fd4; background-color: #2d2d2d; border-radius: 3px; } QTabBar::tab { background-color: #1a1a1a; color: #aaaaaa; padding: 8px 16px; margin-right: 2px; border: 1px solid #3a3a3a; border-bottom: none; border-top-left-radius: 4px; border-top-right-radius: 4px; min-width: 100px; } QTabBar::tab:selected { background-color: #5a9fd4; color: white; font-weight: bold; border: 1px solid #5a9fd4; border-bottom: none; } QTabBar::tab:hover { background-color: #3a3a3a; color: white; } QTabBar::tab:selected:hover { background-color: #7bb3e0; } """ main_layout = QVBoxLayout(dialog) main_layout.setContentsMargins(10, 10, 10, 10) # Create tab widget tabs = QTabWidget() tabs.setStyleSheet(checkbox_radio_style) main_layout.addWidget(tabs) # Tab 1: Detection Thresholds self.create_thresholds_tab(tabs) # Tab 2: Detection Mode self.create_mode_tab(tabs) # Tab 3: Preprocessing self.create_preprocessing_tab(tabs) # Tab 4: Advanced Settings self.create_advanced_tab(tabs) # Buttons at the bottom button_layout = QHBoxLayout() button_layout.setContentsMargins(5, 10, 5, 10) reset_btn = QPushButton("⚠️ Reset to Defaults") reset_btn.clicked.connect(self.reset_defaults) reset_btn.setMinimumHeight(35) reset_btn.setStyleSheet( "QPushButton { " " background-color: #ffc107; " " color: black; " " padding: 8px 20px; " " font-size: 11pt; " " font-weight: bold; " " border-radius: 4px; " "} " "QPushButton:hover { background-color: #e0a800; }" ) button_layout.addWidget(reset_btn) button_layout.addStretch() save_btn = QPushButton("💾 Save") save_btn.clicked.connect(self.apply_ai_hunter_settings) save_btn.setMinimumHeight(35) save_btn.setStyleSheet( "QPushButton { " " background-color: #28a745; " " color: white; " " padding: 8px 20px; " " font-size: 11pt; " " font-weight: bold; " " border-radius: 4px; " "} " "QPushButton:hover { background-color: #218838; }" ) button_layout.addWidget(save_btn) cancel_btn = QPushButton("❌ Cancel") cancel_btn.clicked.connect(dialog.close) cancel_btn.setMinimumHeight(35) cancel_btn.setStyleSheet( "QPushButton { " " background-color: #6c757d; " " color: white; " " padding: 8px 20px; " " font-size: 11pt; " " font-weight: bold; " " border-radius: 4px; " "} " "QPushButton:hover { background-color: #5a6268; }" ) button_layout.addWidget(cancel_btn) main_layout.addLayout(button_layout) dialog.show() def create_thresholds_tab(self, tabs): """Create the thresholds configuration tab (PySide6)""" scroll = QScrollArea() scroll.setWidgetResizable(True) scroll.setHorizontalScrollBarPolicy(Qt.ScrollBarAlwaysOff) frame = QWidget() layout = QVBoxLayout(frame) layout.setContentsMargins(20, 20, 20, 20) layout.setSpacing(10) # Title title = QLabel("Detection Method Thresholds") title.setStyleSheet("font-size: 12pt; font-weight: bold;") layout.addWidget(title) desc = QLabel("Higher values = fewer false positives (more strict)\n" "Lower values = more false positives (more sensitive)") desc.setStyleSheet("color: gray; font-size: 10pt;") layout.addWidget(desc) layout.addSpacing(10) # Threshold controls self.threshold_vars = {} self.threshold_labels = {} descriptions = { 'exact': 'Exact Text Match - Direct character-by-character comparison', 'text': 'Smart Text Similarity - Intelligent text comparison with sampling', 'semantic': 'Semantic Analysis - Character names, dialogue patterns, numbers', 'structural': 'Structural Patterns - Paragraph structure, dialogue distribution', 'character': 'Character Overlap - Common character names between chapters', 'pattern': 'Pattern Analysis - Narrative flow and structure patterns' } ai_config = self.get_ai_config() for method, desc in descriptions.items(): method_frame = QWidget() method_layout = QVBoxLayout(method_frame) method_layout.setContentsMargins(0, 10, 0, 10) # Method name and description label_widget = QWidget() label_layout = QHBoxLayout(label_widget) label_layout.setContentsMargins(0, 0, 0, 0) method_label = QLabel(f"{method.title()}:") method_label.setStyleSheet("font-weight: bold; font-size: 10pt;") label_layout.addWidget(method_label) desc_label = QLabel(f" {desc}") desc_label.setStyleSheet("color: gray; font-size: 9pt;") label_layout.addWidget(desc_label) label_layout.addStretch() method_layout.addWidget(label_widget) # Slider and value slider_widget = QWidget() slider_layout = QHBoxLayout(slider_widget) slider_layout.setContentsMargins(20, 5, 0, 0) slider = QSlider(Qt.Horizontal) slider.setMinimum(10) slider.setMaximum(100) slider.setValue(ai_config['thresholds'][method]) slider.setFixedWidth(400) self._disable_mousewheel(slider) self.threshold_vars[method] = slider slider_layout.addWidget(slider) value_label = QLabel(f"{slider.value()}%") value_label.setFixedWidth(50) self.threshold_labels[method] = value_label slider_layout.addWidget(value_label) # Connect slider to label update slider.valueChanged.connect( lambda val, lbl=value_label: lbl.setText(f"{val}%") ) slider_layout.addStretch() method_layout.addWidget(slider_widget) layout.addWidget(method_frame) # Weight configuration layout.addSpacing(20) weight_title = QLabel("Method Weights (for weighted average mode)") weight_title.setStyleSheet("font-size: 11pt; font-weight: bold;") layout.addWidget(weight_title) layout.addSpacing(10) self.weight_vars = {} for method in descriptions.keys(): w_widget = QWidget() w_layout = QHBoxLayout(w_widget) w_layout.setContentsMargins(0, 5, 0, 5) w_label = QLabel(f"{method.title()} weight:") w_label.setFixedWidth(150) w_layout.addWidget(w_label) w_spinbox = QDoubleSpinBox() w_spinbox.setMinimum(0.1) w_spinbox.setMaximum(2.0) w_spinbox.setSingleStep(0.1) w_spinbox.setValue(ai_config['weights'][method]) w_spinbox.setFixedWidth(80) self._disable_mousewheel(w_spinbox) self.weight_vars[method] = w_spinbox w_layout.addWidget(w_spinbox) w_layout.addStretch() layout.addWidget(w_widget) layout.addStretch() scroll.setWidget(frame) tabs.addTab(scroll, "Detection Thresholds") def create_mode_tab(self, tabs): """Create the detection mode configuration tab (PySide6)""" scroll = QScrollArea() scroll.setWidgetResizable(True) scroll.setHorizontalScrollBarPolicy(Qt.ScrollBarAlwaysOff) frame = QWidget() layout = QVBoxLayout(frame) layout.setContentsMargins(20, 20, 20, 20) layout.setSpacing(10) title = QLabel("Detection Mode Configuration") title.setStyleSheet("font-size: 12pt; font-weight: bold;") layout.addWidget(title) layout.addSpacing(10) # Detection mode selection mode_box = QGroupBox("Detection Mode") mode_layout = QVBoxLayout(mode_box) mode_layout.setSpacing(10) ai_config = self.get_ai_config() self.mode_buttons = {} modes = [ ('single_method', 'Single Method', 'Flag as duplicate if ANY method exceeds its threshold\n(Most sensitive, most false positives)'), ('multi_method', 'Multi-Method Agreement', 'Require multiple methods to agree before flagging\n(Balanced approach)'), ('weighted_average', 'Weighted Average', 'Calculate weighted average of all methods\n(Most nuanced, least false positives)') ] for value, text, desc in modes: rb_widget = QWidget() rb_layout = QVBoxLayout(rb_widget) rb_layout.setContentsMargins(0, 10, 0, 10) rb = QRadioButton(text) if value == ai_config['detection_mode']: rb.setChecked(True) self.mode_buttons[value] = rb rb_layout.addWidget(rb) desc_label = QLabel(desc) desc_label.setStyleSheet("color: gray; font-size: 9pt;") desc_label.setContentsMargins(25, 0, 0, 0) rb_layout.addWidget(desc_label) mode_layout.addWidget(rb_widget) layout.addWidget(mode_box) # Multi-method configuration multi_box = QGroupBox("Multi-Method Settings") multi_layout = QVBoxLayout(multi_box) req_label = QLabel("Number of methods required to agree:") req_label.setStyleSheet("font-size: 10pt;") multi_layout.addWidget(req_label) self.methods_required_spinbox = QSpinBox() self.methods_required_spinbox.setMinimum(1) self.methods_required_spinbox.setMaximum(6) self.methods_required_spinbox.setValue( ai_config['multi_method_requirements']['methods_required']) self.methods_required_spinbox.setFixedWidth(80) self._disable_mousewheel(self.methods_required_spinbox) multi_layout.addWidget(self.methods_required_spinbox) multi_layout.addSpacing(10) min_label = QLabel("Required methods (at least one must be included):") min_label.setStyleSheet("font-size: 10pt;") multi_layout.addWidget(min_label) multi_layout.addSpacing(5) self.required_method_checkboxes = {} for method in ['exact', 'text', 'semantic', 'structural', 'character', 'pattern']: cb = self._create_styled_checkbox(method.title()) cb.setChecked(method in ai_config['multi_method_requirements']['min_methods']) cb.setContentsMargins(20, 0, 0, 0) self.required_method_checkboxes[method] = cb multi_layout.addWidget(cb) layout.addWidget(multi_box) layout.addStretch() scroll.setWidget(frame) tabs.addTab(scroll, "Detection Mode") def create_preprocessing_tab(self, tabs): """Create the preprocessing configuration tab (PySide6)""" scroll = QScrollArea() scroll.setWidgetResizable(True) scroll.setHorizontalScrollBarPolicy(Qt.ScrollBarAlwaysOff) frame = QWidget() layout = QVBoxLayout(frame) layout.setContentsMargins(20, 20, 20, 20) layout.setSpacing(10) title = QLabel("Text Preprocessing Options") title.setStyleSheet("font-size: 12pt; font-weight: bold;") layout.addWidget(title) desc = QLabel("Configure how text is processed before comparison") desc.setStyleSheet("color: gray; font-size: 10pt;") layout.addWidget(desc) layout.addSpacing(10) # Preprocessing options self.prep_checkboxes = {} ai_config = self.get_ai_config() options = [ ('remove_html_spacing', 'Remove HTML with spacing', 'Replace HTML tags with spaces instead of removing completely'), ('normalize_unicode', 'Normalize Unicode', 'Normalize unicode characters (recommended)'), ('ignore_case', 'Case-insensitive comparison', 'Ignore character case when comparing'), ('remove_extra_whitespace', 'Remove extra whitespace', 'Collapse multiple spaces/newlines into single spaces') ] for key, text, desc_text in options: opt_widget = QWidget() opt_layout = QVBoxLayout(opt_widget) opt_layout.setContentsMargins(0, 10, 0, 10) cb = self._create_styled_checkbox(text) cb.setChecked(ai_config['preprocessing'][key]) self.prep_checkboxes[key] = cb opt_layout.addWidget(cb) desc_label = QLabel(desc_text) desc_label.setStyleSheet("color: gray; font-size: 9pt;") desc_label.setContentsMargins(25, 0, 0, 0) opt_layout.addWidget(desc_label) layout.addWidget(opt_widget) layout.addStretch() scroll.setWidget(frame) tabs.addTab(scroll, "Preprocessing") def create_advanced_tab(self, tabs): """Create the advanced settings tab (PySide6)""" scroll = QScrollArea() scroll.setWidgetResizable(True) scroll.setHorizontalScrollBarPolicy(Qt.ScrollBarAlwaysOff) frame = QWidget() layout = QVBoxLayout(frame) layout.setContentsMargins(20, 20, 20, 20) layout.setSpacing(10) title = QLabel("Advanced Settings") title.setStyleSheet("font-size: 12pt; font-weight: bold;") layout.addWidget(title) layout.addSpacing(10) ai_config = self.get_ai_config() # General settings general_box = QGroupBox("General") general_layout = QVBoxLayout(general_box) # Sample size ss_widget = QWidget() ss_layout = QHBoxLayout(ss_widget) ss_layout.setContentsMargins(0, 0, 0, 0) ss_label = QLabel("Sample size:") ss_label.setFixedWidth(150) ss_layout.addWidget(ss_label) self.sample_size_spinbox = QSpinBox() self.sample_size_spinbox.setMinimum(1000) self.sample_size_spinbox.setMaximum(10000) self.sample_size_spinbox.setSingleStep(500) self.sample_size_spinbox.setValue(ai_config['sample_size']) self.sample_size_spinbox.setFixedWidth(100) self._disable_mousewheel(self.sample_size_spinbox) ss_layout.addWidget(self.sample_size_spinbox) ss_unit = QLabel("characters") ss_unit.setStyleSheet("color: gray; font-size: 9pt;") ss_layout.addWidget(ss_unit) ss_layout.addStretch() general_layout.addWidget(ss_widget) # AI Hunter Behavior Settings behavior_label = QLabel("AI Hunter Behavior") behavior_label.setStyleSheet("font-size: 10pt; font-weight: bold;") general_layout.addWidget(behavior_label) general_layout.addSpacing(5) # Retry Attempts retry_widget = QWidget() retry_layout = QHBoxLayout(retry_widget) retry_layout.setContentsMargins(0, 0, 0, 0) retry_label = QLabel("Retry attempts:") retry_label.setFixedWidth(150) retry_layout.addWidget(retry_label) self.retry_attempts_spinbox = QSpinBox() self.retry_attempts_spinbox.setMinimum(1) self.retry_attempts_spinbox.setMaximum(10) self.retry_attempts_spinbox.setValue(ai_config.get('retry_attempts', 3)) self.retry_attempts_spinbox.setFixedWidth(100) self._disable_mousewheel(self.retry_attempts_spinbox) retry_layout.addWidget(self.retry_attempts_spinbox) retry_unit = QLabel("attempts") retry_unit.setStyleSheet("color: gray; font-size: 9pt;") retry_layout.addWidget(retry_unit) retry_layout.addStretch() general_layout.addWidget(retry_widget) # Temperature Change Toggle temp_widget = QWidget() temp_layout = QVBoxLayout(temp_widget) temp_layout.setContentsMargins(0, 10, 0, 0) self.disable_temp_change_checkbox = self._create_styled_checkbox("Disable temperature change behavior") self.disable_temp_change_checkbox.setChecked(ai_config.get('disable_temperature_change', False)) temp_layout.addWidget(self.disable_temp_change_checkbox) temp_desc = QLabel("Prevents AI Hunter from modifying temperature settings during retries") temp_desc.setStyleSheet("color: gray; font-size: 9pt;") temp_desc.setContentsMargins(25, 0, 0, 0) temp_layout.addWidget(temp_desc) general_layout.addWidget(temp_widget) layout.addWidget(general_box) # Edge filters edge_box = QGroupBox("Edge Case Filters") edge_layout = QVBoxLayout(edge_box) # Min text length min_widget = QWidget() min_layout = QHBoxLayout(min_widget) min_layout.setContentsMargins(0, 0, 0, 0) min_label = QLabel("Minimum text length:") min_label.setFixedWidth(150) min_layout.addWidget(min_label) self.min_length_spinbox = QSpinBox() self.min_length_spinbox.setMinimum(100) self.min_length_spinbox.setMaximum(2000) self.min_length_spinbox.setSingleStep(100) self.min_length_spinbox.setValue(ai_config['edge_filters']['min_text_length']) self.min_length_spinbox.setFixedWidth(100) self._disable_mousewheel(self.min_length_spinbox) min_layout.addWidget(self.min_length_spinbox) min_unit = QLabel("characters") min_unit.setStyleSheet("color: gray; font-size: 9pt;") min_layout.addWidget(min_unit) min_layout.addStretch() edge_layout.addWidget(min_widget) # Length ratios ratio_title = QLabel("Length ratio limits:") edge_layout.addWidget(ratio_title) edge_layout.addSpacing(5) ratio_widget = QWidget() ratio_layout = QHBoxLayout(ratio_widget) ratio_layout.setContentsMargins(20, 0, 0, 0) min_ratio_label = QLabel("Min ratio:") min_ratio_label.setFixedWidth(80) ratio_layout.addWidget(min_ratio_label) self.min_ratio_spinbox = QDoubleSpinBox() self.min_ratio_spinbox.setMinimum(0.5) self.min_ratio_spinbox.setMaximum(0.9) self.min_ratio_spinbox.setSingleStep(0.1) self.min_ratio_spinbox.setValue(ai_config['edge_filters']['min_length_ratio']) self.min_ratio_spinbox.setFixedWidth(80) self._disable_mousewheel(self.min_ratio_spinbox) ratio_layout.addWidget(self.min_ratio_spinbox) max_ratio_label = QLabel("Max ratio:") max_ratio_label.setFixedWidth(80) ratio_layout.addWidget(max_ratio_label) self.max_ratio_spinbox = QDoubleSpinBox() self.max_ratio_spinbox.setMinimum(1.1) self.max_ratio_spinbox.setMaximum(2.0) self.max_ratio_spinbox.setSingleStep(0.1) self.max_ratio_spinbox.setValue(ai_config['edge_filters']['max_length_ratio']) self.max_ratio_spinbox.setFixedWidth(80) self._disable_mousewheel(self.max_ratio_spinbox) ratio_layout.addWidget(self.max_ratio_spinbox) ratio_layout.addStretch() edge_layout.addWidget(ratio_widget) ratio_desc = QLabel("Chapters with vastly different lengths won't be compared") ratio_desc.setStyleSheet("color: gray; font-size: 9pt;") ratio_desc.setContentsMargins(20, 5, 0, 0) edge_layout.addWidget(ratio_desc) layout.addWidget(edge_box) # Language Detection lang_box = QGroupBox("Non-Target Language Detection") lang_layout = QVBoxLayout(lang_box) # Enable toggle enable_widget = QWidget() enable_layout = QVBoxLayout(enable_widget) enable_layout.setContentsMargins(0, 0, 0, 0) self.lang_enabled_checkbox = self._create_styled_checkbox("Enable non-target language detection") self.lang_enabled_checkbox.setChecked(ai_config['language_detection']['enabled']) enable_layout.addWidget(self.lang_enabled_checkbox) enable_desc = QLabel("Trigger retranslation when too much non-target language is detected") enable_desc.setStyleSheet("color: gray; font-size: 9pt;") enable_desc.setContentsMargins(25, 0, 0, 0) enable_layout.addWidget(enable_desc) lang_layout.addWidget(enable_widget) lang_layout.addSpacing(10) # Target language selection target_widget = QWidget() target_layout = QHBoxLayout(target_widget) target_layout.setContentsMargins(0, 0, 0, 0) target_label = QLabel("Target language:") target_label.setFixedWidth(150) target_layout.addWidget(target_label) lang_options = list(ai_config['language_detection']['languages'].keys()) self.target_lang_combo = QComboBox() self.target_lang_combo.addItems(lang_options) # Prioritize main config's target language for sync main_target_lang = self.config.get('glossary_target_language') or self.config.get('output_language') if main_target_lang: # Find closest match if main_target_lang in lang_options: self.target_lang_combo.setCurrentText(main_target_lang) else: # Try case-insensitive found = False for opt in lang_options: if opt.lower() == main_target_lang.lower(): self.target_lang_combo.setCurrentText(opt) found = True break if not found: self.target_lang_combo.setCurrentText(ai_config['language_detection']['target_language']) else: self.target_lang_combo.setCurrentText(ai_config['language_detection']['target_language']) self.target_lang_combo.setFixedWidth(150) self._disable_mousewheel(self.target_lang_combo) target_layout.addWidget(self.target_lang_combo) target_desc = QLabel("Language that should be in the translation") target_desc.setStyleSheet("color: gray; font-size: 9pt;") target_layout.addWidget(target_desc) target_layout.addStretch() lang_layout.addWidget(target_widget) # Threshold setting thresh_widget = QWidget() thresh_layout = QHBoxLayout(thresh_widget) thresh_layout.setContentsMargins(0, 5, 0, 0) thresh_label = QLabel("Character threshold:") thresh_label.setFixedWidth(150) thresh_layout.addWidget(thresh_label) self.lang_threshold_spinbox = QSpinBox() self.lang_threshold_spinbox.setMinimum(100) self.lang_threshold_spinbox.setMaximum(2000) self.lang_threshold_spinbox.setSingleStep(50) self.lang_threshold_spinbox.setValue(ai_config['language_detection']['threshold_characters']) self.lang_threshold_spinbox.setFixedWidth(100) self._disable_mousewheel(self.lang_threshold_spinbox) thresh_layout.addWidget(self.lang_threshold_spinbox) thresh_desc = QLabel("non-target language characters to trigger retranslation") thresh_desc.setStyleSheet("color: gray; font-size: 9pt;") thresh_layout.addWidget(thresh_desc) thresh_layout.addStretch() lang_layout.addWidget(thresh_widget) layout.addWidget(lang_box) layout.addStretch() scroll.setWidget(frame) tabs.addTab(scroll, "Advanced") def apply_ai_hunter_settings(self): """Apply AI Hunter settings to the main config (PySide6)""" ai_config = self.get_ai_config() # Update from GUI variables for method, slider in self.threshold_vars.items(): ai_config['thresholds'][method] = slider.value() for method, spinbox in self.weight_vars.items(): ai_config['weights'][method] = spinbox.value() # Get selected detection mode for mode_value, radio_btn in self.mode_buttons.items(): if radio_btn.isChecked(): ai_config['detection_mode'] = mode_value break ai_config['multi_method_requirements']['methods_required'] = self.methods_required_spinbox.value() min_methods = [method for method, cb in self.required_method_checkboxes.items() if cb.isChecked()] ai_config['multi_method_requirements']['min_methods'] = min_methods for key, cb in self.prep_checkboxes.items(): ai_config['preprocessing'][key] = cb.isChecked() ai_config['sample_size'] = self.sample_size_spinbox.value() ai_config['edge_filters']['min_text_length'] = self.min_length_spinbox.value() ai_config['edge_filters']['min_length_ratio'] = self.min_ratio_spinbox.value() ai_config['edge_filters']['max_length_ratio'] = self.max_ratio_spinbox.value() # Language detection settings ai_config['language_detection']['enabled'] = self.lang_enabled_checkbox.isChecked() new_target_lang = self.target_lang_combo.currentText() ai_config['language_detection']['target_language'] = new_target_lang ai_config['language_detection']['threshold_characters'] = self.lang_threshold_spinbox.value() # Sync back to main config self.config['output_language'] = new_target_lang self.config['glossary_target_language'] = new_target_lang # Also update environment variable immediately os.environ['OUTPUT_LANGUAGE'] = new_target_lang os.environ['GLOSSARY_TARGET_LANGUAGE'] = new_target_lang # Update retry attempts and temperature change settings ai_config['retry_attempts'] = self.retry_attempts_spinbox.value() ai_config['disable_temperature_change'] = self.disable_temp_change_checkbox.isChecked() # Update main config self.config['ai_hunter_config'] = ai_config # Call callback if provided (this should trigger main save_configuration) # The callback (save_config) will show its own success message if self.callback: self.callback() self.window.close() def reset_defaults(self): """Reset all values to defaults (PySide6)""" msg_box = QMessageBox(self.window) msg_box.setWindowTitle("Reset to Defaults") msg_box.setText("Are you sure you want to reset all settings to defaults?") msg_box.setStandardButtons(QMessageBox.Yes | QMessageBox.No) msg_box.setDefaultButton(QMessageBox.No) msg_box.setIcon(QMessageBox.Question) try: msg_box.setWindowIcon(QIcon("halgakos.ico")) except Exception: pass reply = msg_box.exec() if reply == QMessageBox.Yes: self.config['ai_hunter_config'] = self.default_ai_hunter.copy() self.window.close() self.show_ai_hunter_config() # Reopen with default values class ImprovedAIHunterDetection: """Improved AI Hunter detection methods for TranslateKRtoEN""" def __init__(self, main_config): """ Initialize with reference to main config Args: main_config: Reference to main translator config dictionary """ self.main_config = main_config # Default AI Hunter settings self.default_ai_hunter = { 'enabled': True, 'lookback_chapters': 5, 'retry_attempts': 3, 'disable_temperature_change': False, 'sample_size': 3000, 'thresholds': { 'exact': 90, 'text': 85, 'semantic': 85, 'structural': 85, 'character': 80, 'pattern': 80 }, 'weights': { 'exact': 1.5, 'text': 1.2, 'semantic': 1.0, 'structural': 1.0, 'character': 0.8, 'pattern': 0.8 }, 'detection_mode': 'multi_method', 'multi_method_requirements': { 'methods_required': 2, 'min_methods': ['semantic', 'structural'] }, 'preprocessing': { 'remove_html_spacing': True, 'normalize_unicode': True, 'ignore_case': True, 'remove_extra_whitespace': True }, 'edge_filters': { 'min_text_length': 500, 'max_length_ratio': 1.3, 'min_length_ratio': 0.7 }, 'language_detection': { 'enabled': False, 'target_language': 'english', 'threshold_characters': 500, 'languages': { 'english': ['en'], 'japanese': ['ja', 'jp'], 'korean': ['ko', 'kr'], 'chinese': ['zh', 'zh-cn', 'zh-tw'], 'spanish': ['es'], 'french': ['fr'], 'german': ['de'], 'russian': ['ru'], 'arabic': ['ar'], 'hindi': ['hi'], 'portuguese': ['pt'], 'italian': ['it'], 'dutch': ['nl'], 'thai': ['th'], 'vietnamese': ['vi'], 'turkish': ['tr'], 'polish': ['pl'], 'swedish': ['sv'], 'danish': ['da'], 'norwegian': ['no'], 'finnish': ['fi'] } } } def get_ai_config(self): """Get AI Hunter configuration from main config""" return self.main_config.get('ai_hunter_config', self.default_ai_hunter) def detect_duplicate_ai_hunter_enhanced(self, result, idx, prog, out, current_chapter_num=None): """Enhanced AI Hunter duplicate detection with configurable parameters""" try: print(f"\n ========== AI HUNTER DEBUG START ==========") print(f" 📍 Current chapter index: {idx}") if current_chapter_num: print(f" 📖 Current chapter number: {current_chapter_num}") # Get configuration config = self.get_ai_config() if not config.get('enabled', True): print(f" ⚠️ AI Hunter is disabled") print(f" ========== AI HUNTER DEBUG END ==========\n") return False, 0 # Preprocess text result_clean = self._preprocess_text(result, config['preprocessing']) print(f" 📄 Text length after preprocessing: {len(result_clean)} chars") # Check for non-target language detection if config['language_detection']['enabled']: non_target_detected, non_target_count = self._check_non_target_language( result_clean, config['language_detection'] ) if non_target_detected: print(f"\n 🌐 NON-TARGET LANGUAGE DETECTED!") print(f" Non-target characters found: {non_target_count}") print(f" Threshold: {config['language_detection']['threshold_characters']}") print(f" Target language: {config['language_detection']['target_language']}") print(f" ========== AI HUNTER DEBUG END ==========\n") return True, 100 # High confidence for language detection # Check edge cases if len(result_clean) < config['edge_filters']['min_text_length']: print(f" ⚠️ Text too short ({len(result_clean)} < {config['edge_filters']['min_text_length']})") print(f" ========== AI HUNTER DEBUG END ==========\n") return False, 0 # Extract features print(f" 🔬 Extracting text features...") result_features = self._extract_text_features(result_clean) # Get lookback from main config, then fall back to env var if not found lookback = self.main_config.get('duplicate_lookback_chapters', int(os.getenv('DUPLICATE_LOOKBACK_CHAPTERS', '5'))) # Log configuration print(f"\n 🔧 Configuration:") print(f" Detection mode: {config['detection_mode']}") print(f" Lookback chapters: {lookback}") print(f" Sample size: {config['sample_size']}") # FIX: Get all completed chapters sorted by actual chapter number completed_chapters = [] for chapter_key, chapter_info in prog["chapters"].items(): if chapter_info.get("status") == "completed" and chapter_info.get("output_file"): # Handle both numeric and hash-based chapter keys try: # Get actual_num from progress (this is the real chapter number) chapter_num = chapter_info.get("actual_num") if chapter_num is None: # Try chapter_num as fallback chapter_num = chapter_info.get("chapter_num") if chapter_num is None: # Skip chapters without valid numbers print(f" ⚠️ No chapter number found for key {chapter_key}, skipping") continue completed_chapters.append({ 'key': chapter_key, 'num': chapter_num, 'file': chapter_info.get("output_file"), 'ai_features': chapter_info.get("ai_features") }) except Exception as e: print(f" ⚠️ Error processing chapter {chapter_key}: {e}") continue # Sort by actual chapter number completed_chapters.sort(key=lambda x: x['num']) # If no current chapter number provided, try to infer it if current_chapter_num is None: # The current chapter should be passed in, but if not, we need to find it # Since we're using content hash keys, we can't use idx directly print(f" ⚠️ No current chapter number provided") print(f" 📊 Current index: {idx}") # The current chapter number should have been passed from the wrapper # If it wasn't, we have a problem print(f" ❌ ERROR: Current chapter number not provided to AI Hunter!") print(f" ❌ This indicates the wrapper function is not passing the chapter number correctly") # Emergency: just use a high number so we don't compare against anything current_chapter_num = 999999 print(f" ⚠️ Using index-based chapter number: {current_chapter_num}") print(f"\n 📚 Found {len(completed_chapters)} completed chapters in progress") if completed_chapters: chapter_nums = [ch['num'] for ch in completed_chapters] print(f" 📊 Chapter numbers in progress: {sorted(chapter_nums)[:10]}{'...' if len(chapter_nums) > 10 else ''}") print(f" 🎯 Current chapter number: {current_chapter_num}") print(f" 🔍 Will check against last {lookback} chapters before chapter {current_chapter_num}") # Check previous chapters all_similarities = [] highest_similarity = 0.0 detected_method = None detected_chapter = None # FIX: Look at chapters by actual number, not index chapters_checked = 0 for completed_chapter in reversed(completed_chapters): # Only check chapters that come before the current one if completed_chapter['num'] >= current_chapter_num: continue # Only check up to lookback number of chapters if chapters_checked >= lookback: break chapters_checked += 1 print(f"\n 📝 Checking against chapter {completed_chapter['num']}...") # Get previous chapter features prev_features = completed_chapter.get('ai_features') prev_clean = None # Try to get cached features first if prev_features: print(f" ✅ Using cached features") else: # Read and extract features prev_path = os.path.join(out, completed_chapter['file']) if os.path.exists(prev_path): try: with open(prev_path, 'r', encoding='utf-8') as f: prev_content = f.read() prev_clean = self._preprocess_text(prev_content, config['preprocessing']) # Check length ratio len_ratio = len(result_clean) / max(1, len(prev_clean)) if (len_ratio < config['edge_filters']['min_length_ratio'] or len_ratio > config['edge_filters']['max_length_ratio']): print(f" ⚠️ Length ratio out of bounds: {len_ratio:.2f}") continue prev_features = self._extract_text_features(prev_clean) print(f" 📄 Extracted features from file") except Exception as e: print(f" ❌ Failed to read file: {e}") continue else: print(f" ❌ File not found: {prev_path}") continue # Calculate similarities print(f" 🔍 Calculating similarities...") similarities = self._calculate_all_similarities( result_clean, result_features, prev_clean, prev_features, config ) # Store for reporting all_similarities.append({ 'chapter': completed_chapter['num'], 'similarities': similarities }) # Log similarity scores for method, score in similarities.items(): if score > 0: print(f" {method}: {int(score*100)}%") # Check if duplicate based on configured mode is_duplicate, confidence, methods_triggered = self._evaluate_duplicate( similarities, config ) if is_duplicate: print(f"\n 🚨 DUPLICATE DETECTED!") print(f" Detection mode: {config['detection_mode']}") print(f" Confidence: {int(confidence*100)}%") print(f" Triggered methods: {', '.join(methods_triggered)}") print(f" Match with: Chapter {completed_chapter['num']}") print(f" ========== AI HUNTER DEBUG END ==========\n") return True, int(confidence * 100) # Track highest for reporting for method, sim in similarities.items(): if sim > highest_similarity: highest_similarity = sim detected_method = method detected_chapter = completed_chapter['num'] # No duplicate found print(f"\n ✅ No duplicate found") if detected_method: print(f" Highest similarity: {int(highest_similarity*100)}% via {detected_method}") print(f" Closest match: Chapter {detected_chapter}") # Show top 3 closest matches if all_similarities: print(f"\n 📊 Top 3 closest matches:") sorted_chapters = sorted(all_similarities, key=lambda x: self._get_chapter_score(x['similarities'], config), reverse=True)[:3] for i, chapter_data in enumerate(sorted_chapters, 1): score = self._get_chapter_score(chapter_data['similarities'], config) print(f" {i}. Chapter {chapter_data['chapter']}: {int(score*100)}%") print(f" ========== AI HUNTER DEBUG END ==========\n") return False, 0 except Exception as e: print(f" ❌ AI Hunter detection failed with error: {e}") import traceback print(f" {traceback.format_exc()}") print(f" ========== AI HUNTER DEBUG END ==========\n") return False, 0 def _preprocess_text(self, text, prep_config): """Preprocess text according to configuration""" # Remove HTML if prep_config.get('remove_html_spacing', True): text = re.sub(r'<[^>]+>', ' ', text) else: text = re.sub(r'<[^>]+>', '', text) # Normalize unicode - use NFC to preserve Korean/CJK characters if prep_config.get('normalize_unicode', True): text = unicodedata.normalize('NFC', text) # Remove extra whitespace if prep_config.get('remove_extra_whitespace', True): text = re.sub(r'\s+', ' ', text) text = re.sub(r'\n\s*\n', '\n\n', text) text = text.strip() # Convert to lowercase if case-insensitive if prep_config.get('ignore_case', True): text = text.lower() return text def _calculate_all_similarities(self, result_clean, result_features, prev_clean, prev_features, config): """Calculate all similarity metrics""" similarities = {} # Method 1: Exact content match if prev_clean is not None: sample_size = min(config['sample_size'], len(result_clean), len(prev_clean)) exact_sim = self._calculate_exact_similarity( result_clean[:sample_size], prev_clean[:sample_size] ) similarities['exact'] = exact_sim # Method 2: Smart text similarity text_sim = self._calculate_smart_similarity( result_clean, prev_clean, config['sample_size'] ) similarities['text'] = text_sim else: similarities['exact'] = 0.0 similarities['text'] = 0.0 # Method 3: Semantic fingerprint semantic_sim = self._calculate_semantic_similarity( result_features.get('semantic', {}), prev_features.get('semantic', {}) ) similarities['semantic'] = semantic_sim # Method 4: Structural signature structural_sim = self._calculate_structural_similarity( result_features.get('structural', {}), prev_features.get('structural', {}) ) similarities['structural'] = structural_sim # Method 5: Character analysis char_sim = self._calculate_character_similarity( result_features.get('characters', []), prev_features.get('characters', []) ) similarities['character'] = char_sim # Method 6: Pattern analysis pattern_sim = self._calculate_pattern_similarity( result_features.get('patterns', {}), prev_features.get('patterns', {}) ) similarities['pattern'] = pattern_sim return similarities def _evaluate_duplicate(self, similarities, config): """Evaluate if similarities indicate a duplicate based on detection mode""" mode = config['detection_mode'] thresholds = {k: v/100.0 for k, v in config['thresholds'].items()} if mode == 'single_method': # Any method exceeding threshold for method, sim in similarities.items(): if sim >= thresholds.get(method, 0.85): return True, sim, [method] return False, 0, [] elif mode == 'multi_method': # Multiple methods must agree triggered_methods = [] for method, sim in similarities.items(): if sim >= thresholds.get(method, 0.85): triggered_methods.append(method) # Check if enough methods triggered required = config.get('multi_method_requirements', {}).get('methods_required', 2) min_methods = config.get('multi_method_requirements', {}).get('min_methods', []) if len(triggered_methods) >= required: # Check if at least one required method is included if not min_methods or any(m in triggered_methods for m in min_methods): # Calculate average confidence of triggered methods confidence = sum(similarities[m] for m in triggered_methods) / len(triggered_methods) return True, confidence, triggered_methods return False, 0, [] elif mode == 'weighted_average': # Calculate weighted average weights = config.get('weights', {}) total_weight = sum(weights.get(m, 1.0) for m in similarities) weighted_sum = sum(similarities[m] * weights.get(m, 1.0) for m in similarities) weighted_avg = weighted_sum / total_weight if total_weight > 0 else 0 # Check if weighted average exceeds average threshold avg_threshold = sum(thresholds.values()) / len(thresholds) if thresholds else 0.85 if weighted_avg >= avg_threshold: # Find which methods contributed most triggered = [m for m, sim in similarities.items() if sim >= thresholds.get(m, 0.85)] return True, weighted_avg, triggered return False, 0, [] return False, 0, [] def _get_chapter_score(self, similarities, config): """Calculate overall score for a chapter comparison""" if config['detection_mode'] == 'weighted_average': weights = config.get('weights', {}) total_weight = sum(weights.get(m, 1.0) for m in similarities) return sum(similarities.get(m, 0) * weights.get(m, 1.0) for m in similarities) / total_weight if total_weight > 0 else 0 else: return max(similarities.values()) if similarities else 0 def _extract_text_features(self, text): """Extract multiple features from text for AI Hunter analysis""" features = { 'semantic': {}, 'structural': {}, 'characters': [], 'patterns': {} } # Semantic fingerprint lines = text.split('\n') # Character extraction (names that appear 3+ times) words = re.findall(r'\b[A-Z][a-z]+\b', text) word_freq = Counter(words) features['characters'] = [name for name, count in word_freq.items() if count >= 3 and name not in { 'The', 'A', 'An', 'In', 'On', 'At', 'To', 'From', 'With', 'By', 'For', 'Of', 'As', 'But', 'And', 'Or', 'He', 'She', 'It', 'They', 'We', 'You', 'What', 'When', 'Where', 'Who', 'Why', 'How', 'That', 'This', 'These' }] # Dialogue patterns dialogue_patterns = re.findall(r'"([^"]+)"', text) features['semantic']['dialogue_count'] = len(dialogue_patterns) features['semantic']['dialogue_lengths'] = [len(d) for d in dialogue_patterns[:10]] # Speaker patterns speaker_patterns = re.findall(r'(\w+)\s+(?:said|asked|replied|shouted|whispered)', text.lower()) features['semantic']['speakers'] = list(set(speaker_patterns[:20])) # Number extraction numbers = re.findall(r'\b\d+\b', text) features['patterns']['numbers'] = numbers[:20] # Structural signature para_lengths = [] dialogue_count = 0 for para in text.split('\n\n'): if para.strip(): para_lengths.append(len(para)) if '"' in para: dialogue_count += 1 features['structural']['para_count'] = len(para_lengths) features['structural']['avg_para_length'] = sum(para_lengths) / max(1, len(para_lengths)) features['structural']['dialogue_ratio'] = dialogue_count / max(1, len(para_lengths)) # Create structural pattern string pattern = [] for para in text.split('\n\n')[:20]: # First 20 paragraphs if para.strip(): if '"' in para: pattern.append('D') # Dialogue elif len(para) > 300: pattern.append('L') # Long elif len(para) < 100: pattern.append('S') # Short else: pattern.append('M') # Medium features['structural']['pattern'] = ''.join(pattern) # Action density action_verbs = len(re.findall(r'\b\w+ed\b', text)) features['semantic']['action_density'] = action_verbs / max(1, len(text.split())) # Text length features['semantic']['text_length'] = len(text) return features def _calculate_exact_similarity(self, text1, text2): """Calculate exact text similarity""" return SequenceMatcher(None, text1, text2).ratio() def _calculate_smart_similarity(self, text1, text2, sample_size): """Smart similarity with configurable sample size""" if len(text1) > sample_size * 3 and len(text2) > sample_size * 3: # Use multiple samples samples1 = [ text1[:sample_size], text1[len(text1)//2 - sample_size//2:len(text1)//2 + sample_size//2], text1[-sample_size:] ] samples2 = [ text2[:sample_size], text2[len(text2)//2 - sample_size//2:len(text2)//2 + sample_size//2], text2[-sample_size:] ] similarities = [SequenceMatcher(None, s1, s2).ratio() for s1, s2 in zip(samples1, samples2)] return sum(similarities) / len(similarities) else: # Use full text up to sample size return SequenceMatcher(None, text1[:sample_size], text2[:sample_size]).ratio() def _calculate_semantic_similarity(self, sem1, sem2): """Calculate semantic fingerprint similarity""" score = 0.0 weights = 0.0 # Compare dialogue counts if 'dialogue_count' in sem1 and 'dialogue_count' in sem2: weights += 0.3 if sem1['dialogue_count'] > 0 or sem2['dialogue_count'] > 0: ratio = min(sem1['dialogue_count'], sem2['dialogue_count']) / \ max(1, max(sem1['dialogue_count'], sem2['dialogue_count'])) score += ratio * 0.3 # Compare speakers if 'speakers' in sem1 and 'speakers' in sem2: weights += 0.4 if sem1['speakers'] and sem2['speakers']: overlap = len(set(sem1['speakers']) & set(sem2['speakers'])) total = len(set(sem1['speakers']) | set(sem2['speakers'])) score += (overlap / max(1, total)) * 0.4 elif not sem1['speakers'] and not sem2['speakers']: score += 0.4 # Both have no speakers # Compare dialogue lengths pattern if 'dialogue_lengths' in sem1 and 'dialogue_lengths' in sem2: weights += 0.2 if sem1['dialogue_lengths'] and sem2['dialogue_lengths']: len1 = sem1['dialogue_lengths'][:10] len2 = sem2['dialogue_lengths'][:10] if len1 and len2: avg1 = sum(len1) / len(len1) avg2 = sum(len2) / len(len2) ratio = min(avg1, avg2) / max(1, max(avg1, avg2)) score += ratio * 0.2 elif not sem1['dialogue_lengths'] and not sem2['dialogue_lengths']: score += 0.2 # Both have no dialogue # Action density if 'action_density' in sem1 and 'action_density' in sem2: weights += 0.1 act_sim = 1 - abs(sem1['action_density'] - sem2['action_density']) score += act_sim * 0.1 return score / max(0.1, weights) def _calculate_structural_similarity(self, struct1, struct2): """Calculate structural signature similarity""" score = 0.0 # Compare paragraph patterns if 'pattern' in struct1 and 'pattern' in struct2: pattern_sim = SequenceMatcher(None, struct1['pattern'], struct2['pattern']).ratio() score += pattern_sim * 0.5 # Compare paragraph statistics if all(k in struct1 for k in ['para_count', 'avg_para_length', 'dialogue_ratio']) and \ all(k in struct2 for k in ['para_count', 'avg_para_length', 'dialogue_ratio']): # Paragraph count ratio para_ratio = min(struct1['para_count'], struct2['para_count']) / \ max(1, max(struct1['para_count'], struct2['para_count'])) score += para_ratio * 0.2 # Average length ratio avg_ratio = min(struct1['avg_para_length'], struct2['avg_para_length']) / \ max(1, max(struct1['avg_para_length'], struct2['avg_para_length'])) score += avg_ratio * 0.15 # Dialogue ratio similarity dialogue_diff = abs(struct1['dialogue_ratio'] - struct2['dialogue_ratio']) score += (1 - min(1, dialogue_diff)) * 0.15 return score def _calculate_character_similarity(self, chars1, chars2): """Calculate character overlap similarity""" if not chars1 or not chars2: return 0.0 # Convert to sets set1 = set(chars1) set2 = set(chars2) # If no overlap at all, return 0 intersection = set1 & set2 if not intersection: return 0.0 # Calculate Jaccard index (intersection over union) union = set1 | set2 jaccard = len(intersection) / len(union) # Also consider the proportion of matching characters relative to each set # This prevents small overlaps from scoring too high overlap1 = len(intersection) / len(set1) overlap2 = len(intersection) / len(set2) # Take the minimum overlap to be more conservative min_overlap = min(overlap1, overlap2) # Combine jaccard and overlap scores # Jaccard penalizes when sets are very different sizes # Min overlap ensures both texts share a significant portion of characters score = (jaccard + min_overlap) / 2 return score def _calculate_pattern_similarity(self, pat1, pat2): """Calculate pattern similarity (numbers, etc.)""" score = 0.0 # Number overlap if 'numbers' in pat1 and 'numbers' in pat2: nums1 = set(pat1['numbers']) nums2 = set(pat2['numbers']) if nums1 or nums2: overlap = len(nums1 & nums2) total = len(nums1 | nums2) score = overlap / max(1, total) else: score = 1.0 # Both have no numbers return score def _check_non_target_language(self, text, lang_config): """Check if text contains too much non-target language""" target_language = lang_config['target_language'].lower() threshold = lang_config['threshold_characters'] # Character ranges for different languages language_ranges = { 'english': [ # Latin script + basic symbols (0x0000, 0x007F), # Basic Latin (0x0080, 0x00FF), # Latin-1 Supplement (0x0100, 0x017F), # Latin Extended-A (0x0180, 0x024F), # Latin Extended-B (0x2000, 0x206F), # General Punctuation (0x20A0, 0x20CF), # Currency Symbols (0xFF00, 0xFFEF), # Halfwidth and Fullwidth Forms ], 'japanese': [ (0x3040, 0x309F), # Hiragana (0x30A0, 0x30FF), # Katakana (0x4E00, 0x9FAF), # CJK Unified Ideographs (0x3400, 0x4DBF), # CJK Extension A (0xFF66, 0xFF9F), # Halfwidth Katakana ], 'korean': [ (0xAC00, 0xD7AF), # Hangul Syllables (0x1100, 0x11FF), # Hangul Jamo (0x3130, 0x318F), # Hangul Compatibility Jamo (0xA960, 0xA97F), # Hangul Jamo Extended-A (0xD7B0, 0xD7FF), # Hangul Jamo Extended-B ], 'chinese': [ (0x4E00, 0x9FAF), # CJK Unified Ideographs (0x3400, 0x4DBF), # CJK Extension A (0x20000, 0x2A6DF), # CJK Extension B (0x2A700, 0x2B73F), # CJK Extension C (0x2B740, 0x2B81F), # CJK Extension D (0x3000, 0x303F), # CJK Symbols and Punctuation ], 'arabic': [ (0x0600, 0x06FF), # Arabic (0x0750, 0x077F), # Arabic Supplement (0x08A0, 0x08FF), # Arabic Extended-A (0xFB50, 0xFDFF), # Arabic Presentation Forms-A (0xFE70, 0xFEFF), # Arabic Presentation Forms-B ], 'russian': [ (0x0400, 0x04FF), # Cyrillic (0x0500, 0x052F), # Cyrillic Supplement (0x2DE0, 0x2DFF), # Cyrillic Extended-A (0xA640, 0xA69F), # Cyrillic Extended-B ], 'thai': [ (0x0E00, 0x0E7F), # Thai ], 'hindi': [ (0x0900, 0x097F), # Devanagari (0xA8E0, 0xA8FF), # Devanagari Extended ], 'spanish': [ # Same as English (Latin script) (0x0000, 0x007F), # Basic Latin (0x0080, 0x00FF), # Latin-1 Supplement (0x0100, 0x017F), # Latin Extended-A (0x0180, 0x024F), # Latin Extended-B ], 'french': [ # Same as English (Latin script) (0x0000, 0x007F), # Basic Latin (0x0080, 0x00FF), # Latin-1 Supplement (0x0100, 0x017F), # Latin Extended-A (0x0180, 0x024F), # Latin Extended-B ], 'german': [ # Same as English (Latin script) (0x0000, 0x007F), # Basic Latin (0x0080, 0x00FF), # Latin-1 Supplement (0x0100, 0x017F), # Latin Extended-A (0x0180, 0x024F), # Latin Extended-B ], 'portuguese': [ # Same as English (Latin script) (0x0000, 0x007F), # Basic Latin (0x0080, 0x00FF), # Latin-1 Supplement (0x0100, 0x017F), # Latin Extended-A (0x0180, 0x024F), # Latin Extended-B ], 'italian': [ # Same as English (Latin script) (0x0000, 0x007F), # Basic Latin (0x0080, 0x00FF), # Latin-1 Supplement (0x0100, 0x017F), # Latin Extended-A (0x0180, 0x024F), # Latin Extended-B ], 'dutch': [ # Same as English (Latin script) (0x0000, 0x007F), # Basic Latin (0x0080, 0x00FF), # Latin-1 Supplement (0x0100, 0x017F), # Latin Extended-A (0x0180, 0x024F), # Latin Extended-B ], 'vietnamese': [ (0x0000, 0x007F), # Basic Latin (0x0080, 0x00FF), # Latin-1 Supplement (0x0100, 0x017F), # Latin Extended-A (0x0180, 0x024F), # Latin Extended-B (0x1EA0, 0x1EFF), # Latin Extended Additional (Vietnamese) ], 'turkish': [ (0x0000, 0x007F), # Basic Latin (0x0080, 0x00FF), # Latin-1 Supplement (0x0100, 0x017F), # Latin Extended-A (0x0180, 0x024F), # Latin Extended-B ], 'polish': [ (0x0000, 0x007F), # Basic Latin (0x0080, 0x00FF), # Latin-1 Supplement (0x0100, 0x017F), # Latin Extended-A (0x0180, 0x024F), # Latin Extended-B ], 'swedish': [ # Same as English (Latin script) (0x0000, 0x007F), # Basic Latin (0x0080, 0x00FF), # Latin-1 Supplement (0x0100, 0x017F), # Latin Extended-A (0x0180, 0x024F), # Latin Extended-B ], 'danish': [ # Same as English (Latin script) (0x0000, 0x007F), # Basic Latin (0x0080, 0x00FF), # Latin-1 Supplement (0x0100, 0x017F), # Latin Extended-A (0x0180, 0x024F), # Latin Extended-B ], 'norwegian': [ # Same as English (Latin script) (0x0000, 0x007F), # Basic Latin (0x0080, 0x00FF), # Latin-1 Supplement (0x0100, 0x017F), # Latin Extended-A (0x0180, 0x024F), # Latin Extended-B ], 'finnish': [ # Same as English (Latin script) (0x0000, 0x007F), # Basic Latin (0x0080, 0x00FF), # Latin-1 Supplement (0x0100, 0x017F), # Latin Extended-A (0x0180, 0x024F), # Latin Extended-B ], } # Get target language ranges target_ranges = language_ranges.get(target_language, language_ranges['english']) # Count characters that are NOT in target language ranges non_target_count = 0 total_letters = 0 for char in text: # Skip whitespace, punctuation, and numbers for counting if char.isspace() or char.isdigit(): continue # Count as letter character total_letters += 1 # Check if character is in any target language range char_code = ord(char) is_target_char = any(start <= char_code <= end for start, end in target_ranges) if not is_target_char: non_target_count += 1 # Debug logging if non_target_count > 0: print(f" 🌐 Language detection: {non_target_count}/{total_letters} non-target chars ({target_language})") # Return True if non-target character count exceeds threshold return non_target_count >= threshold, non_target_count