import argparse
import copy
import json
import os

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import torch
import torch.nn as nn
import torch.optim as optim
from scipy import stats
from sklearn.impute import SimpleImputer
from sklearn.metrics import (accuracy_score, confusion_matrix,
                             precision_recall_fscore_support, roc_auc_score)
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder, StandardScaler
from torch.utils.data import DataLoader, TensorDataset


def setup_gpu():
    """Return True if a CUDA device is available, otherwise fall back to CPU."""
    if torch.cuda.is_available():
        return True
    print("No GPUs found. Using CPU.")
    return False


GPU_AVAILABLE = setup_gpu()
DEVICE = torch.device('cuda' if GPU_AVAILABLE else 'cpu')


def load_data_from_json(directory_path):
    """Load every JSON file in a directory and return a DataFrame with binary labels."""
    if os.path.isfile(directory_path):
        directory = os.path.dirname(directory_path)
    else:
        directory = directory_path
    print(f"Loading JSON files from directory: {directory}")
    json_files = [os.path.join(directory, f) for f in os.listdir(directory)
                  if f.endswith('.json') and os.path.isfile(os.path.join(directory, f))]
    if not json_files:
        raise ValueError(f"No JSON files found in directory {directory}")
    print(f"Found {len(json_files)} JSON files")
    all_data = []
    for file_path in json_files:
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                data_dict = json.load(f)
            if 'data' in data_dict:
                all_data.extend(data_dict['data'])
            else:
                print(f"Warning: 'data' key not found in {os.path.basename(file_path)}")
        except Exception as e:
            print(f"Error loading {os.path.basename(file_path)}: {e}")
    if not all_data:
        raise ValueError("Failed to load data from JSON files")
    df = pd.DataFrame(all_data)
    # Map raw source tags onto the two target classes; rewritten AI text counts as AI.
    label_mapping = {
        'ai': 'AI',
        'human': 'Human',
        'ai+rew': 'AI',
    }
    if 'source' in df.columns:
        df['label'] = df['source'].map(lambda x: label_mapping.get(x, x))
    else:
        print("Warning: 'source' column not found, using default label")
        df['label'] = 'Unknown'
    valid_labels = ['AI', 'Human']
    df = df[df['label'].isin(valid_labels)]
    print(f"Filtered to {len(df)} examples with labels: {valid_labels}")
    print(f"Label distribution: {df['label'].value_counts().to_dict()}")
    return df


class Medium_Binary_Network(nn.Module):
    """Feed-forward binary classifier: stacked Linear/ReLU/Dropout blocks and a 2-logit head."""

    def __init__(self, input_size, hidden_sizes=[256, 128, 64, 32], dropout=0.3):
        super().__init__()
        layers = []
        prev_size = input_size
        for hidden_size in hidden_sizes:
            layers.append(nn.Linear(prev_size, hidden_size))
            layers.append(nn.ReLU())
            layers.append(nn.Dropout(dropout))
            prev_size = hidden_size
        layers.append(nn.Linear(prev_size, 2))
        self.model = nn.Sequential(*layers)

    def forward(self, x):
        return self.model(x)
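

# Illustrative shape check for the network above (not part of the original script;
# the batch size and feature count below are arbitrary assumptions). Not called
# anywhere by default.
def _smoke_test_medium_binary_network():
    model = Medium_Binary_Network(input_size=40)  # hypothetical feature count
    dummy_batch = torch.randn(8, 40)              # batch of 8 feature vectors
    logits = model(dummy_batch)
    assert logits.shape == (8, 2)                 # one raw logit per class (AI / Human)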


def cross_validate_simple_classifier(directory_path="experiments/results/two_scores_with_long_text_analyze_2048T",
                                     feature_config=None,
                                     n_splits=5,
                                     random_state=42,
                                     epochs=100,
                                     hidden_sizes=[256, 128, 64, 32],
                                     dropout=0.3,
                                     early_stopping_patience=10):
    """Run stratified k-fold cross-validation for the medium binary classifier."""
    print("\n" + "=" * 50)
    print("MEDIUM BINARY CLASSIFIER CROSS-VALIDATION")
    print("=" * 50)
    if feature_config is None:
        feature_config = {
            'basic_scores': True,
            'basic_text_stats': ['total_tokens', 'total_words', 'unique_words', 'stop_words', 'avg_word_length'],
            'morphological': ['pos_distribution', 'unique_lemmas', 'lemma_word_ratio'],
            'syntactic': ['dependencies', 'noun_chunks'],
            'entities': ['total_entities', 'entity_types'],
            'diversity': ['ttr', 'mtld'],
            'structure': ['sentence_count', 'avg_sentence_length', 'question_sentences', 'exclamation_sentences'],
            'readability': ['words_per_sentence', 'syllables_per_word', 'flesh_kincaid_score', 'long_words_percent'],
            'semantic': True
        }
    df = load_data_from_json(directory_path)
    features_df = select_features(df, feature_config)
    print(f"Selected {len(features_df.columns)} features")
    imputer = SimpleImputer(strategy='mean')
    X = imputer.fit_transform(features_df)
    label_encoder = LabelEncoder()
    y = label_encoder.fit_transform(df['label'].values)
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    fold_metrics = []
    fold_models = []
    all_train_losses = []
    all_val_losses = []
    all_train_accs = []
    all_val_accs = []
    all_y_true = []
    all_y_pred = []
    best_fold_score = -1
    best_fold_index = -1
    for fold, (train_idx, test_idx) in enumerate(skf.split(X, y)):
        print(f"\n{'=' * 20} Fold {fold + 1}/{n_splits} {'=' * 20}")
        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]
        # Fit the scaler on the training fold only to avoid test-fold leakage.
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)
        X_train_tensor = torch.FloatTensor(X_train_scaled).to(DEVICE)
        y_train_tensor = torch.LongTensor(y_train).to(DEVICE)
        X_test_tensor = torch.FloatTensor(X_test_scaled).to(DEVICE)
        y_test_tensor = torch.LongTensor(y_test).to(DEVICE)
        train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
        train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
        model = Medium_Binary_Network(X_train_scaled.shape[1], hidden_sizes=hidden_sizes, dropout=dropout).to(DEVICE)
        print(f"Model created with {len(hidden_sizes)} hidden layers: {hidden_sizes}")
        criterion = nn.CrossEntropyLoss()
        optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-5)
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=5)
        best_val_loss = float('inf')
        patience_counter = 0
        best_model_state = None
        train_losses = []
        val_losses = []
        train_accs = []
        val_accs = []
        for epoch in range(epochs):
            model.train()
            running_loss = 0.0
            running_corrects = 0
            for inputs, labels in train_loader:
                optimizer.zero_grad()
                outputs = model(inputs)
                loss = criterion(outputs, labels)
                loss.backward()
                optimizer.step()
                _, preds = torch.max(outputs, 1)
                running_loss += loss.item() * inputs.size(0)
                running_corrects += torch.sum(preds == labels).item()
            epoch_loss = running_loss / len(train_loader.dataset)
            epoch_acc = running_corrects / len(train_loader.dataset)
            train_losses.append(epoch_loss)
            train_accs.append(epoch_acc)
            model.eval()
            with torch.no_grad():
                val_outputs = model(X_test_tensor)
                val_loss = criterion(val_outputs, y_test_tensor)
                val_losses.append(val_loss.item())
                _, val_preds = torch.max(val_outputs, 1)
                val_acc = torch.sum(val_preds == y_test_tensor).item() / len(y_test_tensor)
                val_accs.append(val_acc)
            if val_loss < best_val_loss:
                best_val_loss = val_loss
                patience_counter = 0
                # Deep-copy the weights: state_dict() returns live references, so a
                # shallow .copy() would keep tracking the optimizer's later updates.
                best_model_state = copy.deepcopy(model.state_dict())
            else:
                patience_counter += 1
                if patience_counter >= early_stopping_patience:
                    print(f"Early stopping at epoch {epoch + 1}")
                    break
            scheduler.step(val_loss)
            if (epoch + 1) % 10 == 0 or epoch == 0:
                print(f"Epoch {epoch + 1}/{epochs}, Train Loss: {epoch_loss:.4f}, Train Acc: {epoch_acc:.4f}, "
                      f"Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}")
        if best_model_state:
            model.load_state_dict(best_model_state)
            print("Loaded best model weights")
        model.eval()
        with torch.no_grad():
            test_outputs = model(X_test_tensor)
            _, predicted = torch.max(test_outputs, 1)
            test_acc = torch.sum(predicted == y_test_tensor).item() / len(y_test_tensor)
        y_test_np = y_test
        predicted_np = predicted.cpu().numpy()
        all_y_true.extend(y_test_np)
        all_y_pred.extend(predicted_np)
        precision, recall, f1, _ = precision_recall_fscore_support(y_test_np, predicted_np, average='weighted')
        fold_metric = {
            'fold': fold + 1,
            'accuracy': float(test_acc),
            'precision': float(precision),
            'recall': float(recall),
            'f1': float(f1),
            'val_loss': float(best_val_loss)
        }
        fold_metrics.append(fold_metric)
        fold_models.append({
            'model': model,
            'scaler': scaler,
            'label_encoder': label_encoder,
            'imputer': imputer,
            'score': test_acc
        })
        if test_acc > best_fold_score:
            best_fold_score = test_acc
            best_fold_index = fold
        all_train_losses.extend(train_losses)
        all_val_losses.extend(val_losses)
        all_train_accs.extend(train_accs)
        all_val_accs.extend(val_accs)
        print(f"Fold {fold + 1} Results:")
        print(f"  Accuracy: {test_acc:.4f}")
        print(f"  Precision: {precision:.4f}")
        print(f"  Recall: {recall:.4f}")
        print(f"  F1 Score: {f1:.4f}")
    overall_accuracy = accuracy_score(all_y_true, all_y_pred)
    overall_precision, overall_recall, overall_f1, _ = precision_recall_fscore_support(
        all_y_true, all_y_pred, average='weighted'
    )
    fold_accuracies = [metrics['accuracy'] for metrics in fold_metrics]
    mean_accuracy = np.mean(fold_accuracies)
    std_accuracy = np.std(fold_accuracies)
    # Normal-approximation 95% confidence interval over the fold accuracies.
    ci_lower = mean_accuracy - 1.96 * std_accuracy / np.sqrt(n_splits)
    ci_upper = mean_accuracy + 1.96 * std_accuracy / np.sqrt(n_splits)
    plot_learning_curve(all_train_losses, all_val_losses)
    plot_accuracy_curve(all_train_accs, all_val_accs)
    class_names = label_encoder.classes_
    cm = confusion_matrix(all_y_true, all_y_pred)
    plt.figure(figsize=(10, 8))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=class_names,
                yticklabels=class_names)
    plt.title('Binary Classification Confusion Matrix (Cross-Validation)')
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')
    os.makedirs('plots/binary', exist_ok=True)
    plt.savefig('plots/binary/confusion_matrix_medium.png')
    plt.close()
    print("\n" + "=" * 50)
    print("CROSS-VALIDATION SUMMARY")
    print("=" * 50)
    print(f"Mean Accuracy: {mean_accuracy:.4f} ± {std_accuracy:.4f}")
    print(f"95% Confidence Interval: [{ci_lower:.4f}, {ci_upper:.4f}]")
    print(f"Overall Accuracy: {overall_accuracy:.4f}")
    print(f"Overall Precision: {overall_precision:.4f}")
    print(f"Overall Recall: {overall_recall:.4f}")
    print(f"Overall F1: {overall_f1:.4f}")
    print(f"\nBest Fold: {best_fold_index + 1} (Accuracy: {fold_metrics[best_fold_index]['accuracy']:.4f})")
    best_model_data = fold_models[best_fold_index]
    results = {
        'fold_metrics': fold_metrics,
        'overall': {
            'accuracy': float(overall_accuracy),
            'precision': float(overall_precision),
            'recall': float(overall_recall),
            'f1': float(overall_f1)
        },
        'cross_validation': {
            'mean_accuracy': float(mean_accuracy),
            'std_accuracy': float(std_accuracy),
            'confidence_interval_95': [float(ci_lower), float(ci_upper)]
        },
        'best_fold': {
            'fold': best_fold_index + 1,
            'accuracy': float(fold_metrics[best_fold_index]['accuracy'])
        },
        'model_config': {
            'hidden_sizes': hidden_sizes,
            'dropout': dropout
        }
    }
    output_dir = 'models/medium_binary_classifier'
    save_paths = save_binary_model(best_model_data, results, output_dir=output_dir)
    return best_model_data, results, save_paths


def plot_learning_curve(train_losses, val_losses):
    """Plot loss curves; epochs from all folds are concatenated on a single axis."""
    plt.figure(figsize=(10, 6))
    epochs = range(1, len(train_losses) + 1)
    plt.plot(epochs, train_losses, 'b-', label='Training Loss')
    plt.plot(epochs, val_losses, 'r-', label='Validation Loss')
    plt.title('Learning Curve')
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.legend()
    plt.grid(True)
    os.makedirs('plots/binary', exist_ok=True)
    plt.savefig('plots/binary/learning_curve.png')
    plt.close()
    print("Learning curve saved to plots/binary/learning_curve.png")


def plot_accuracy_curve(train_accuracies, val_accuracies):
    """Plot accuracy curves; epochs from all folds are concatenated on a single axis."""
    plt.figure(figsize=(10, 6))
    epochs = range(1, len(train_accuracies) + 1)
    plt.plot(epochs, train_accuracies, 'g-', label='Training Accuracy')
    plt.plot(epochs, val_accuracies, 'm-', label='Validation Accuracy')
    plt.title('Accuracy Curve')
    plt.xlabel('Epochs')
    plt.ylabel('Accuracy')
    plt.legend()
    plt.grid(True)
    plt.ylim(0, 1.0)
    os.makedirs('plots/binary', exist_ok=True)
    plt.savefig('plots/binary/accuracy_curve.png')
    plt.close()
    print("Accuracy curve saved to plots/binary/accuracy_curve.png")


def select_features(df, feature_config):
    """Flatten the nested `text_analysis` dicts into a flat, numeric feature frame."""
    features_df = pd.DataFrame()
    if feature_config.get('basic_scores', True):
        if 'score_chat' in df.columns:
            features_df['score_chat'] = df['score_chat']
        if 'score_coder' in df.columns:
            features_df['score_coder'] = df['score_coder']
    if 'text_analysis' in df.columns:
        if feature_config.get('basic_text_stats'):
            for feature in feature_config['basic_text_stats']:
                features_df[f'basic_{feature}'] = df['text_analysis'].apply(
                    lambda x: x.get('basic_stats', {}).get(feature, 0) if isinstance(x, dict) else 0
                )
        if feature_config.get('morphological'):
            for feature in feature_config['morphological']:
                if feature == 'pos_distribution':
                    pos_types = ['NOUN', 'VERB', 'ADJ', 'ADV', 'PROPN', 'DET', 'ADP', 'PRON', 'CCONJ', 'SCONJ']
                    for pos in pos_types:
                        features_df[f'pos_{pos}'] = df['text_analysis'].apply(
                            lambda x: x.get('morphological_analysis', {}).get('pos_distribution', {}).get(pos, 0)
                            if isinstance(x, dict) else 0
                        )
                else:
                    features_df[f'morph_{feature}'] = df['text_analysis'].apply(
                        lambda x: x.get('morphological_analysis', {}).get(feature, 0) if isinstance(x, dict) else 0
                    )
        if feature_config.get('syntactic'):
            for feature in feature_config['syntactic']:
                if feature == 'dependencies':
                    dep_types = ['nsubj', 'obj', 'amod', 'nmod', 'ROOT', 'punct', 'case']
                    for dep in dep_types:
                        features_df[f'dep_{dep}'] = df['text_analysis'].apply(
                            lambda x: x.get('syntactic_analysis', {}).get('dependencies', {}).get(dep, 0)
                            if isinstance(x, dict) else 0
                        )
                else:
                    features_df[f'synt_{feature}'] = df['text_analysis'].apply(
                        lambda x: x.get('syntactic_analysis', {}).get(feature, 0) if isinstance(x, dict) else 0
                    )
        if feature_config.get('entities'):
            for feature in feature_config['entities']:
                if feature == 'entity_types':
                    entity_types = ['PER', 'LOC', 'ORG']
                    for ent in entity_types:
                        features_df[f'ent_{ent}'] = df['text_analysis'].apply(
                            lambda x: x.get('named_entities', {}).get('entity_types', {}).get(ent, 0)
                            if isinstance(x, dict) else 0
                        )
                else:
                    features_df[f'ent_{feature}'] = df['text_analysis'].apply(
                        lambda x: x.get('named_entities', {}).get(feature, 0) if isinstance(x, dict) else 0
                    )
        if feature_config.get('diversity'):
            for feature in feature_config['diversity']:
                features_df[f'div_{feature}'] = df['text_analysis'].apply(
                    lambda x: x.get('lexical_diversity', {}).get(feature, 0) if isinstance(x, dict) else 0
                )
        if feature_config.get('structure'):
            for feature in feature_config['structure']:
                features_df[f'struct_{feature}'] = df['text_analysis'].apply(
                    lambda x: x.get('text_structure', {}).get(feature, 0) if isinstance(x, dict) else 0
                )
        if feature_config.get('readability'):
            for feature in feature_config['readability']:
                features_df[f'read_{feature}'] = df['text_analysis'].apply(
                    lambda x: x.get('readability', {}).get(feature, 0) if isinstance(x, dict) else 0
                )
        if feature_config.get('semantic'):
            features_df['semantic_coherence'] = df['text_analysis'].apply(
                lambda x: x.get('semantic_coherence', {}).get('avg_coherence_score', 0) if isinstance(x, dict) else 0
            )
    print(f"Generated {len(features_df.columns)} features")
    return features_df
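

# For reference: select_features above assumes each `text_analysis` cell is a nested
# dict shaped roughly like the abridged, illustrative record below (the field values
# are made up); any missing key silently falls back to 0.
# {
#     "basic_stats": {"total_tokens": 412, "total_words": 356, "unique_words": 201, ...},
#     "morphological_analysis": {"pos_distribution": {"NOUN": 0.21, ...}, "unique_lemmas": 198, "lemma_word_ratio": 0.56},
#     "syntactic_analysis": {"dependencies": {"nsubj": 31, ...}, "noun_chunks": 44},
#     "named_entities": {"total_entities": 7, "entity_types": {"PER": 2, "LOC": 1, "ORG": 4}},
#     "lexical_diversity": {"ttr": 0.56, "mtld": 71.3},
#     "text_structure": {"sentence_count": 18, "avg_sentence_length": 19.8, ...},
#     "readability": {"words_per_sentence": 19.8, "flesh_kincaid_score": 10.2, ...},
#     "semantic_coherence": {"avg_coherence_score": 0.83}
# }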


def augment_text_features(features_df, num_augmentations=5, noise_factor=0.05):
    """Append noisy copies of the feature matrix (multiplicative Gaussian noise)."""
    augmented_dfs = [features_df]
    for i in range(num_augmentations):
        numeric_cols = features_df.select_dtypes(include=[np.number]).columns
        augmented_df = features_df.copy()
        for col in numeric_cols:
            augmented_df[col] = augmented_df[col].astype(float)
        # Noise is proportional to each value, so zero-valued features stay zero.
        noise = augmented_df[numeric_cols] * np.random.normal(0, noise_factor, size=augmented_df[numeric_cols].shape)
        augmented_df[numeric_cols] += noise
        augmented_dfs.append(augmented_df)
    return pd.concat(augmented_dfs, ignore_index=True)
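

# `cross_validate_binary_classifier` below calls `build_neural_network`, which is not
# defined in this file. The sketch below is a minimal stand-in that mirrors the
# Linear/ReLU/Dropout pattern of Medium_Binary_Network; the original architecture
# behind this name is an assumption.
def build_neural_network(input_size, num_classes, hidden_layers=[256, 128, 64], dropout=0.3):
    layers = []
    prev_size = input_size
    for hidden_size in hidden_layers:
        layers.append(nn.Linear(prev_size, hidden_size))
        layers.append(nn.ReLU())
        layers.append(nn.Dropout(dropout))
        prev_size = hidden_size
    layers.append(nn.Linear(prev_size, num_classes))
    # Move to the module-level device so it matches the tensors built in the CV loop.
    return nn.Sequential(*layers).to(DEVICE)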


def cross_validate_binary_classifier(directory_path="experiments/results/two_scores_with_long_text_analyze_2048T",
                                     model_config=None,
                                     feature_config=None,
                                     n_splits=5,
                                     random_state=42,
                                     epochs=100,
                                     early_stopping_patience=10,
                                     use_augmentation=True,
                                     num_augmentations=2,
                                     noise_factor=0.05):
    """Cross-validate a binary classifier, optionally on noise-augmented features."""
    if model_config is None:
        model_config = {
            'hidden_layers': [256, 128, 64],
            'dropout_rate': 0.3
        }
    if feature_config is None:
        feature_config = {
            'basic_scores': True,
            'basic_text_stats': ['total_tokens', 'total_words', 'unique_words', 'stop_words', 'avg_word_length'],
            'morphological': ['pos_distribution', 'unique_lemmas', 'lemma_word_ratio'],
            'syntactic': ['dependencies', 'noun_chunks'],
            'entities': ['total_entities', 'entity_types'],
            'diversity': ['ttr', 'mtld'],
            'structure': ['sentence_count', 'avg_sentence_length', 'question_sentences', 'exclamation_sentences'],
            'readability': ['words_per_sentence', 'syllables_per_word', 'flesh_kincaid_score', 'long_words_percent'],
            'semantic': True
        }
    print("\n" + "=" * 50)
    print("BINARY CLASSIFIER CROSS-VALIDATION")
    print("=" * 50)
    df = load_data_from_json(directory_path)
    features_df = select_features(df, feature_config)
    print(f"Selected features: {features_df.columns.tolist()}")
    imputer = SimpleImputer(strategy='mean')
    if use_augmentation:
        print(f"Augmenting data with {num_augmentations} copies (noise factor: {noise_factor})...")
        original_size = len(features_df)
        features_df_augmented = augment_text_features(features_df,
                                                      num_augmentations=num_augmentations,
                                                      noise_factor=noise_factor)
        # Labels repeat in the order in which pd.concat stacked the copies. Caveat:
        # augmenting before the CV split lets noisy copies of one sample land in both
        # train and test folds, which inflates the fold scores.
        y_augmented = np.tile(df['label'].values, num_augmentations + 1)
        print(f"Data size increased from {original_size} to {len(features_df_augmented)}")
        X = imputer.fit_transform(features_df_augmented)
        y = y_augmented
    else:
        X = imputer.fit_transform(features_df)
        y = df['label'].values
    label_encoder = LabelEncoder()
    y_encoded = label_encoder.fit_transform(y)
    print(f"Data size: {X.shape}")
    print(f"Labels distribution: {pd.Series(y).value_counts().to_dict()}")
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    fold_metrics = []
    fold_models = []
    all_y_true = []
    all_y_pred = []
    all_y_scores = []
    best_fold_score = -1
    best_fold_index = -1
    print(f"\nPerforming {n_splits}-fold cross-validation...")
    # Number of final epochs whose weights are averaged (only reached without early stopping).
    num_avg_epochs = 5
    for fold, (train_idx, test_idx) in enumerate(skf.split(X, y_encoded)):
        print(f"\n{'=' * 20} Fold {fold + 1}/{n_splits} {'=' * 20}")
        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y_encoded[train_idx], y_encoded[test_idx]
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)
        X_train_tensor = torch.FloatTensor(X_train_scaled).to(DEVICE)
        y_train_tensor = torch.LongTensor(y_train).to(DEVICE)
        X_test_tensor = torch.FloatTensor(X_test_scaled).to(DEVICE)
        y_test_tensor = torch.LongTensor(y_test).to(DEVICE)
        train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
        train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
        num_classes = len(label_encoder.classes_)
        model = build_neural_network(X_train_scaled.shape[1], num_classes,
                                     hidden_layers=model_config['hidden_layers'])
        criterion = nn.CrossEntropyLoss()
        optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-5)
        scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.95)
        best_val_loss = float('inf')
        patience_counter = 0
        best_model_state = None
        train_losses = []
        val_losses = []
        saved_weights = []
        for epoch in range(epochs):
            model.train()
            running_loss = 0.0
            for inputs, labels in train_loader:
                optimizer.zero_grad()
                outputs = model(inputs)
                loss = criterion(outputs, labels)
                loss.backward()
                optimizer.step()
                running_loss += loss.item() * inputs.size(0)
            epoch_loss = running_loss / len(train_loader.dataset)
            train_losses.append(epoch_loss)
            model.eval()
            with torch.no_grad():
                val_outputs = model(X_test_tensor)
                val_loss = criterion(val_outputs, y_test_tensor)
                val_losses.append(val_loss.item())
            if val_loss < best_val_loss:
                best_val_loss = val_loss
                patience_counter = 0
                # Deep copy: a shallow .copy() of state_dict() would keep live references.
                best_model_state = copy.deepcopy(model.state_dict())
            else:
                patience_counter += 1
                if patience_counter >= early_stopping_patience:
                    print(f"Early stopping at epoch {epoch + 1}")
                    break
            # Snapshot the weights of the final num_avg_epochs epochs for averaging.
            if epoch >= epochs - num_avg_epochs:
                saved_weights.append(copy.deepcopy(model.state_dict()))
            scheduler.step()
            if (epoch + 1) % 10 == 0 or epoch == 0:
                print(f"Epoch {epoch + 1}/{epochs}, Train Loss: {epoch_loss:.4f}, Val Loss: {val_loss:.4f}")
        if saved_weights:
            print(f"Averaging weights from last {len(saved_weights)} epochs...")
            # Clone so the running sums do not modify the saved snapshots in place.
            avg_state_dict = {key: value.clone() for key, value in saved_weights[0].items()}
            for key in avg_state_dict:
                for i in range(1, len(saved_weights)):
                    avg_state_dict[key] += saved_weights[i][key]
                avg_state_dict[key] /= len(saved_weights)
            model.load_state_dict(avg_state_dict)
            print("Model loaded with averaged weights")
        elif best_model_state:
            model.load_state_dict(best_model_state)
            print("Model loaded with best validation weights")
        model.eval()
        with torch.no_grad():
            test_outputs = model(X_test_tensor)
            _, predicted = torch.max(test_outputs, 1)
            predicted_np = predicted.cpu().numpy()
            # Softmax the logits to get P(class 1) for ROC/AUC.
            probabilities = torch.softmax(test_outputs, dim=1)
            pos_scores = probabilities[:, 1].cpu().numpy()
        all_y_true.extend(y_test)
        all_y_pred.extend(predicted_np)
        all_y_scores.extend(pos_scores)
        fold_acc = accuracy_score(y_test, predicted_np)
        precision, recall, f1, _ = precision_recall_fscore_support(y_test, predicted_np, average='weighted')
        try:
            fold_auc = roc_auc_score(y_test, pos_scores)
        except ValueError:
            fold_auc = 0.0
            print("Warning: Could not compute AUC")
        fold_metrics.append({
            'fold': fold + 1,
            'accuracy': float(fold_acc),
            'precision': float(precision),
            'recall': float(recall),
            'f1': float(f1),
            'auc': float(fold_auc),
            'best_val_loss': float(best_val_loss)
        })
        fold_models.append({
            'model': model,
            'scaler': scaler,
            'label_encoder': label_encoder,
            'imputer': imputer,
            'score': fold_acc
        })
        if fold_acc > best_fold_score:
            best_fold_score = fold_acc
            best_fold_index = fold
        print(f"Fold {fold + 1} Results:")
        print(f"  Accuracy: {fold_acc:.4f}")
        print(f"  Precision: {precision:.4f}")
        print(f"  Recall: {recall:.4f}")
        print(f"  F1 Score: {f1:.4f}")
        if fold_auc > 0:
            print(f"  AUC: {fold_auc:.4f}")
    overall_accuracy = accuracy_score(all_y_true, all_y_pred)
    overall_precision, overall_recall, overall_f1, _ = precision_recall_fscore_support(
        all_y_true, all_y_pred, average='weighted'
    )
    try:
        overall_auc = roc_auc_score(all_y_true, all_y_scores)
    except ValueError:
        overall_auc = 0.0
        print("Warning: Could not compute overall AUC")
    fold_accuracies = [metrics['accuracy'] for metrics in fold_metrics]
    mean_accuracy = np.mean(fold_accuracies)
    std_accuracy = np.std(fold_accuracies)
    ci_lower = mean_accuracy - 1.96 * std_accuracy / np.sqrt(n_splits)
    ci_upper = mean_accuracy + 1.96 * std_accuracy / np.sqrt(n_splits)
    # Majority-class baseline, plus a one-sample t-test of fold accuracies against it.
    class_counts = np.bincount(y_encoded)
    baseline_accuracy = np.max(class_counts) / len(y_encoded)
    most_frequent_class = np.argmax(class_counts)
    t_stat, p_value = stats.ttest_1samp(fold_accuracies, baseline_accuracy)
    best_model_data = fold_models[best_fold_index]
    class_names = label_encoder.classes_
    cm = confusion_matrix(all_y_true, all_y_pred)
    plt.figure(figsize=(10, 8))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=class_names,
                yticklabels=class_names)
    plt.title('Binary Classification Confusion Matrix (Cross-Validation)')
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')
    os.makedirs('plots/binary', exist_ok=True)
    plt.savefig('plots/binary/confusion_matrix_cv.png')
    plt.close()
    if overall_auc > 0:
        from sklearn.metrics import roc_curve
        fpr, tpr, _ = roc_curve(all_y_true, all_y_scores)
        plt.figure(figsize=(10, 8))
        plt.plot(fpr, tpr, lw=2, label=f'ROC curve (AUC = {overall_auc:.4f})')
        plt.plot([0, 1], [0, 1], 'k--', lw=2)
        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.05])
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('Receiver Operating Characteristic (ROC)')
        plt.legend(loc="lower right")
        plt.savefig('plots/binary/roc_curve.png')
        plt.close()
    results = {
        'fold_metrics': fold_metrics,
        'overall': {
            'accuracy': float(overall_accuracy),
            'precision': float(overall_precision),
            'recall': float(overall_recall),
            'f1': float(overall_f1),
            'auc': float(overall_auc) if overall_auc > 0 else None
        },
        'cross_validation': {
            'mean_accuracy': float(mean_accuracy),
            'std_accuracy': float(std_accuracy),
            'confidence_interval_95': [float(ci_lower), float(ci_upper)],
            'baseline_accuracy': float(baseline_accuracy),
            'most_frequent_class': str(label_encoder.inverse_transform([most_frequent_class])[0]),
            't_statistic': float(t_stat),
            'p_value': float(p_value),
            'statistically_significant': "yes" if p_value < 0.05 else "no"
        },
        'best_fold': {
            'fold': best_fold_index + 1,
            'accuracy': float(fold_metrics[best_fold_index]['accuracy'])
        }
    }
    print("\n" + "=" * 50)
    print("CROSS-VALIDATION SUMMARY")
    print("=" * 50)
    print(f"Mean Accuracy: {mean_accuracy:.4f} ± {std_accuracy:.4f}")
    print(f"95% Confidence Interval: [{ci_lower:.4f}, {ci_upper:.4f}]")
    print(f"Overall Accuracy: {overall_accuracy:.4f}")
    print(f"Baseline Accuracy: {baseline_accuracy:.4f} (most frequent class: {label_encoder.inverse_transform([most_frequent_class])[0]})")
    print(f"T-statistic: {t_stat:.4f}, p-value: {p_value:.6f}")
    if p_value < 0.05:
        print("The model is significantly better than the baseline (p < 0.05)")
    else:
        print("The model is NOT significantly better than the baseline (p >= 0.05)")
    print(f"\nBest Fold: {best_fold_index + 1} (Accuracy: {fold_metrics[best_fold_index]['accuracy']:.4f})")
    return best_model_data, results


def save_binary_model(model_data, results, output_dir='models/binary_classifier'):
    """Persist the model weights, preprocessing artifacts, and CV results."""
    os.makedirs(output_dir, exist_ok=True)
    model_path = os.path.join(output_dir, 'nn_model.pt')
    torch.save(model_data['model'].state_dict(), model_path)
    scaler_path = os.path.join(output_dir, 'scaler.joblib')
    joblib.dump(model_data['scaler'], scaler_path)
    encoder_path = os.path.join(output_dir, 'label_encoder.joblib')
    joblib.dump(model_data['label_encoder'], encoder_path)
    imputer_path = os.path.join(output_dir, 'imputer.joblib')
    joblib.dump(model_data['imputer'], imputer_path)
    results_path = os.path.join(output_dir, 'cv_results.json')
    with open(results_path, 'w') as f:
        json.dump(results, f, indent=4)
    print(f"Binary model saved to {model_path}")
    print(f"CV results saved to {results_path}")
    return {
        'model_path': model_path,
        'scaler_path': scaler_path,
        'encoder_path': encoder_path,
        'imputer_path': imputer_path,
        'results_path': results_path
    }
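

# Illustrative counterpart to save_binary_model (not in the original script): reload
# the saved artifacts for inference. Assumes a checkpoint written by
# cross_validate_simple_classifier, whose cv_results.json includes 'model_config'.
def load_binary_model(output_dir='models/medium_binary_classifier'):
    with open(os.path.join(output_dir, 'cv_results.json')) as f:
        results = json.load(f)
    scaler = joblib.load(os.path.join(output_dir, 'scaler.joblib'))
    label_encoder = joblib.load(os.path.join(output_dir, 'label_encoder.joblib'))
    imputer = joblib.load(os.path.join(output_dir, 'imputer.joblib'))
    # The fitted scaler records the training feature count, so the net can be rebuilt.
    input_size = scaler.mean_.shape[0]
    config = results['model_config']
    model = Medium_Binary_Network(input_size,
                                  hidden_sizes=config['hidden_sizes'],
                                  dropout=config['dropout']).to(DEVICE)
    model.load_state_dict(torch.load(os.path.join(output_dir, 'nn_model.pt'), map_location=DEVICE))
    model.eval()
    return model, scaler, label_encoder, imputer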


def parse_args():
    parser = argparse.ArgumentParser(description='Binary Neural Network Classifier (Human vs AI) with Cross-Validation')
    parser.add_argument('--random_seed', type=int, default=42,
                        help='Random seed for reproducibility')
    parser.add_argument('--folds', type=int, default=5,
                        help='Number of cross-validation folds')
    parser.add_argument('--epochs', type=int, default=150,
                        help='Maximum number of training epochs per fold')
    parser.add_argument('--patience', type=int, default=15,
                        help='Early stopping patience (epochs)')
    return parser.parse_args()


def main():
    print("\n" + "=" * 50)
    print("MEDIUM BINARY CLASSIFIER")
    print("=" * 50 + "\n")
    args = parse_args()
    seed = args.random_seed
    np.random.seed(seed)
    torch.manual_seed(seed)
    if GPU_AVAILABLE:
        torch.cuda.manual_seed_all(seed)
    # Use a non-interactive backend so figures can be saved on headless machines.
    plt.switch_backend('agg')
    feature_config = {
        'basic_scores': True,
        'basic_text_stats': ['total_tokens', 'total_words', 'unique_words', 'stop_words', 'avg_word_length'],
        'morphological': ['pos_distribution', 'unique_lemmas', 'lemma_word_ratio'],
        'syntactic': ['dependencies', 'noun_chunks'],
        'entities': ['total_entities', 'entity_types'],
        'diversity': ['ttr', 'mtld'],
        'structure': ['sentence_count', 'avg_sentence_length', 'question_sentences', 'exclamation_sentences'],
        'readability': ['words_per_sentence', 'syllables_per_word', 'flesh_kincaid_score', 'long_words_percent'],
        'semantic': True
    }
    # The CLI arguments control the fold count, epoch budget, and early-stopping patience.
    model_data, results, save_paths = cross_validate_simple_classifier(
        directory_path="experiments/results/two_scores_with_long_text_analyze_2048T",
        feature_config=feature_config,
        n_splits=args.folds,
        random_state=seed,
        epochs=args.epochs,
        hidden_sizes=[256, 192, 128, 64],
        dropout=0.3,
        early_stopping_patience=args.patience
    )
    print("\nTraining completed.")
    print(f"Medium binary classifier saved to {save_paths['model_path']}")


if __name__ == "__main__":
    main()
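
# Example invocation (the script filename is assumed; adjust to the actual file name):
#   python medium_binary_classifier.py --random_seed 42 --folds 5 --epochs 150 --patience 15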