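"""MediBot: extract drug names from prescription images.

Pipeline: RapidOCR reads the image, line-level gates drop obviously
non-medical text, TAB/CAP-style anchors isolate drug-name tokens, and
SymSpell plus a drug database validate and canonicalize each candidate.
A Gradio interface wraps the whole flow.
"""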
import gradio as gr
import json
import tempfile
import pickle
import os
import cv2
import pandas as pd
import requests
import re
from symspellpy import SymSpell, Verbosity
from rapidocr import RapidOCR, EngineType, LangCls, LangDet, LangRec, ModelType, OCRVersion
# Constants
ANCHOR_PREFIXES = ["tab", "cap", "t."]
# Medical anchors (TAB/CAP/INJ/etc.)
ANCHORS = [
    r"tab\.?", r"cap\.?", r"inj\.?", r"syp\.?", r"syr\.?",
    r"sol\.?", r"susp\.?", r"oint\.?", r"crm\.?", r"gel\.?",
    r"drops?", r"powder", r"dragees?", r"t\.?", r"c\.?"
]
ANCHOR_PATTERN = re.compile(r"\b(" + "|".join(ANCHORS) + r")", re.IGNORECASE)
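# e.g. ANCHOR_PATTERN.search("Tab. Clopitab 75mg") matches "Tab.";
# there is no trailing \b, so run-together forms like "TABCLOPITAB" still hit.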
# Non-medical line patterns (to drop lines early)
NON_MED_PATTERNS = [
    r"emergency", r"contact", r"please",
    r"nephrologist", r"cardiologist",
    r"opinion", r"inform", r"kftafter", r"prescription",
    r"follow[- ]up", r"dr\.", r"physician", r"clinic",
    r"hospital", r"diagnosed", r"treatment", r"patient",
    r"age[: ]", r"sex[: ]", r"weight[: ]", r"height[: ]",
    r"bp[: ]", r"pulse[: ]", r"temperature[: ]",
    r"investigation", r"advised", r"admission", r"discharge",
    r"report", r"lab[: ]", r"laboratory", r"radiology",
    r"address", r"phone[: ]", r"mobile[: ]", r"email[: ]",
    r"signature", r"regd\.?", r"drugs? prescribed"
]
NON_MED_REGEX = re.compile("|".join(NON_MED_PATTERNS), re.IGNORECASE)
# Rescue list for drug-like English words
rescue_list = {"d3", "b12", "k2", "iron", "zinc", "calcium", "vit", "xl"}
def is_potential_med_line(text: str) -> bool:
    """A line qualifies only if it has no non-medical keyword,
    contains a TAB/CAP/INJ-style anchor, and carries a digit
    (dose, strength, or count)."""
    t = text.lower()
    if NON_MED_REGEX.search(t):
        return False
    if not ANCHOR_PATTERN.search(t):
        return False
    return bool(re.search(r"\d", t))
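# For example:
#   is_potential_med_line("TAB CLOPITAB 75MG")        -> True
#   is_potential_med_line("Dr. Sharma, Cardiologist") -> False  (non-medical keyword)
#   is_potential_med_line("TAB SOBISISTAB")           -> False  (no digit)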
def validate_drug_match(term: str, drug_db, drug_token_index):
    """
    Map a SymSpell term -> canonical database drug, or None if noise.
    """
    if term in drug_db:
        return term
    if term in drug_token_index:
        # Pick one canonical name deterministically; change the
        # selection logic here if another policy is needed.
        return sorted(drug_token_index[term])[0]
    return None
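# For example, with a hypothetical index entry
# drug_token_index["forte"] == {"calcimax forte"}:
#   validate_drug_match("forte", drug_db, drug_token_index) -> "calcimax forte"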
def normalize_anchored_tokens(raw_text: str):
    """
    Use TAB/CAP/T. as anchors, not something to delete:
    - 'TABCLOPITAB75MG TAB' -> ['clopitab']
    - 'TAB SOBISISTAB' -> ['sobisistab']
    - 'TABSTARPRESSXL25MGTAB' -> ['starpressxl']
    """
    t = raw_text.lower()
    # Remove dosage units and bare numbers but keep anchor letters
    t = re.sub(r"\d+\s*(mg|ml|gm|%|u|mcg)", " ", t)
    t = re.sub(r"\d+", " ", t)
    tokens = t.split()
    normalized = []
    skip_next = False
    for i, tok in enumerate(tokens):
        if skip_next:
            skip_next = False
            continue
        # Strip stray periods so 'tab.' and 't.' behave like 'tab' and 't'
        base = tok.strip(".")
        # Case 1: token starts with an anchor as a prefix (no space)
        for pref in ANCHOR_PREFIXES:
            if base.startswith(pref) and len(base) > len(pref):
                base = base[len(pref):]
                break
        # Case 2: token is a pure anchor; attach it to the next token
        if base in ("tab", "cap", "t"):
            if i + 1 < len(tokens):
                merged = tokens[i + 1].strip(".")
                for pref in ANCHOR_PREFIXES:
                    if merged.startswith(pref) and len(merged) > len(pref):
                        merged = merged[len(pref):]
                        break
                base = merged
                skip_next = True
            else:
                continue
        base = base.strip(". ")  # drop leftover separators
        if len(base) >= 3:
            normalized.append(base)
    return normalized
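# A hypothetical OCR line exercising the anchor logic:
#   normalize_anchored_tokens("TAB. AZITHRAL 500MG") -> ['azithral']
# ('tab.' reduces to the pure anchor 'tab', which attaches to 'azithral').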
def initialize_database():
    data_path = os.path.join(os.path.dirname(__file__), "data/Dataset.csv")
    df = pd.read_csv(data_path)
    drug_db = set(df["Combined_Drugs"].astype(str).str.lower().str.strip())
    sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
    for drug in drug_db:
        sym_spell.create_dictionary_entry(drug, 100000)
        # Also index the individual words of multi-word drug names
        parts = drug.split()
        if len(parts) > 1:
            for p in parts:
                if len(p) > 3:
                    sym_spell.create_dictionary_entry(p, 100000)
    # Map each token back to the full canonical names containing it
    drug_token_index = {}
    for full in drug_db:
        for tok in full.split():
            if len(tok) < 3:
                continue
            drug_token_index.setdefault(tok, set()).add(full)
    # English filter: common words that should not count as drug hits
    try:
        url = (
            "https://raw.githubusercontent.com/first20hours/"
            "google-10000-english/master/google-10000-english-no-swears.txt"
        )
        response = requests.get(url, timeout=10)
        english_vocab = set(response.text.split())
    except Exception:
        # Offline fallback: a tiny stand-in vocabulary
        english_vocab = {"the", "and", "tab", "cap", "mg", "ml"}
    return {
        'drug_db': drug_db,
        'sym_spell': sym_spell,
        'drug_token_index': drug_token_index,
        'english_vocab': english_vocab,
        'rescue_list': rescue_list,
        'NON_MED_REGEX': NON_MED_REGEX,
        'ANCHOR_PATTERN': ANCHOR_PATTERN,
        'ANCHOR_PREFIXES': ANCHOR_PREFIXES
    }
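# process_image_ocr() below expects a pre-built pickle at
# cache/database_cache.pkl, but nothing in this file creates it. A
# minimal sketch for producing it, assuming the SymSpell instance and
# compiled regexes pickle cleanly (both are plain Python objects, so
# they normally do); build_database_cache is a helper name introduced
# here, not part of the original app:
def build_database_cache():
    cache = initialize_database()
    cache_dir = os.path.join(os.path.dirname(__file__), "cache")
    os.makedirs(cache_dir, exist_ok=True)
    with open(os.path.join(cache_dir, "database_cache.pkl"), "wb") as f:
        pickle.dump(cache, f)
    return cache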
def process_image_ocr(image_path):
    # Load the cached database; fall back to a fresh build if the
    # pickle is missing or unreadable
    cache_path = os.path.join(os.path.dirname(__file__), "cache/database_cache.pkl")
    try:
        with open(cache_path, 'rb') as f:
            cache = pickle.load(f)
    except (FileNotFoundError, EOFError, pickle.UnpicklingError):
        print("database_cache.pkl missing or unreadable; initializing database...")
        cache = initialize_database()
    drug_db = cache['drug_db']
    sym_spell = cache['sym_spell']
    drug_token_index = cache['drug_token_index']
    english_vocab = cache['english_vocab']
    rescue_list = cache['rescue_list']
    # Load image using cv2
    img = cv2.imread(image_path)
    if img is None:
        raise ValueError(f"Could not load image from {image_path}")
    # Create a RapidOCR engine (ONNX Runtime, mobile PP-OCRv4 models)
    ocr_engine = RapidOCR(
        params={
            "Global.max_side_len": 2000,
            "Det.engine_type": EngineType.ONNXRUNTIME,
            "Det.lang_type": LangDet.CH,
            "Det.model_type": ModelType.MOBILE,
            "Det.ocr_version": OCRVersion.PPOCRV4,
            "Cls.engine_type": EngineType.ONNXRUNTIME,
            "Cls.lang_type": LangCls.CH,
            "Cls.model_type": ModelType.MOBILE,
            "Cls.ocr_version": OCRVersion.PPOCRV4,
            "Rec.engine_type": EngineType.ONNXRUNTIME,
            "Rec.lang_type": LangRec.CH,
            "Rec.model_type": ModelType.MOBILE,
            "Rec.ocr_version": OCRVersion.PPOCRV4,
        }
    )
    # Run OCR (detection + orientation classification + recognition)
    ocr_result = ocr_engine(
        img,
        use_det=True,
        use_cls=True,
        use_rec=True,
        text_score=0.5,
        box_thresh=0.5,
        unclip_ratio=1.6,
        return_word_box=False,
    )
    # .txts can be None when nothing is detected
    ocr_data = ocr_result.txts or []
    found_meds_with_originals = {}
    for item in ocr_data:
        text_lower = item.lower()
        # Strong line-level gate
        if not is_potential_med_line(text_lower):
            continue
        # Skip doctor-name lines (cheap second guard on top of NON_MED_REGEX)
        if "dr." in text_lower or "dr " in text_lower:
            continue
        # Anchor-aware tokens
        candidate_tokens = normalize_anchored_tokens(item)
        # Optional SymSpell segmentation on the normalized tokens
        if candidate_tokens:
            segmentation = sym_spell.word_segmentation(" ".join(candidate_tokens))
            candidate_tokens = segmentation.corrected_string.split()
        for word in candidate_tokens:
            if len(word) < 3:
                continue
            if word in english_vocab and word not in rescue_list:
                continue
            # Check for an exact match first to avoid false positives
            # from SymSpell corrections
            canonical = validate_drug_match(word, drug_db, drug_token_index)
            if canonical:
                lines = found_meds_with_originals.setdefault(canonical, [])
                if item not in lines:
                    lines.append(item)
                continue  # exact match found, skip SymSpell
            suggestions = sym_spell.lookup(
                word, Verbosity.CLOSEST, max_edit_distance=1
            )
            if not suggestions:
                continue
            canonical = validate_drug_match(
                suggestions[0].term, drug_db, drug_token_index
            )
            if not canonical:
                continue  # reject noise that is not truly in drug_db
            lines = found_meds_with_originals.setdefault(canonical, [])
            if item not in lines:
                lines.append(item)
    print("\nJSON Output:")
    print(json.dumps(found_meds_with_originals, indent=4))
    return found_meds_with_originals
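# Example return value (hypothetical drug name and OCR line):
#   {"clopitab": ["TABCLOPITAB75MG TAB"]}
# i.e. canonical drug -> list of the raw OCR lines it was found on.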
def process_prescription(image):
    if image is None:
        return "No image uploaded."
    # Save the PIL image to a temp file for cv2, then clean it up.
    # Convert to RGB first so PNG uploads with an alpha channel still
    # save as JPEG.
    with tempfile.NamedTemporaryFile(suffix='.jpg', delete=False) as tmp:
        tmp_path = tmp.name
    try:
        image.convert("RGB").save(tmp_path)
        result = process_image_ocr(tmp_path)
    finally:
        os.unlink(tmp_path)
    return json.dumps(result, indent=4)
iface = gr.Interface(
    fn=process_prescription,
    inputs=gr.Image(type="pil", label="Upload Prescription Image"),
    outputs=gr.Textbox(label="Extracted Drugs", lines=20),
    title="MediBot - Drug Extraction from Prescriptions",
    description="Upload a prescription image to extract drug information."
)
if __name__ == "__main__":
    iface.launch()