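"""MediBot: extract drug names from prescription images.

Pipeline: RapidOCR reads the image, line-level gates drop obviously
non-medical text, TAB/CAP-style anchors isolate drug-name tokens, and
SymSpell plus a drug database validate and canonicalize each candidate.
A Gradio interface wraps the whole flow.
"""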
import gradio as gr
import json
import tempfile
import pickle
import os
import cv2
import pandas as pd
import requests
import re
from symspellpy import SymSpell, Verbosity
from rapidocr import RapidOCR, EngineType, LangCls, LangDet, LangRec, ModelType, OCRVersion
# Constants
ANCHOR_PREFIXES = ["tab", "cap", "t."]
# Medical anchors (TAB/CAP/INJ/etc.)
ANCHORS = [
    r"tab\.?", r"cap\.?", r"inj\.?", r"syp\.?", r"syr\.?",
    r"sol\.?", r"susp\.?", r"oint\.?", r"crm\.?", r"gel\.?",
    r"drops?", r"powder", r"dragees?", r"t\.?", r"c\.?"
]
ANCHOR_PATTERN = re.compile(r"\b(" + "|".join(ANCHORS) + r")", re.IGNORECASE)
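# e.g. ANCHOR_PATTERN.search("Tab. Clopitab 75mg") matches "Tab.";
# there is no trailing \b, so run-together forms like "TABCLOPITAB" still hit.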
# Non-medical line patterns (to drop lines early)
NON_MED_PATTERNS = [
    r"emergency", r"contact", r"please",
    r"nephrologist", r"cardiologist",
    r"opinion", r"inform", r"kftafter", r"prescription",
    r"follow[- ]up", r"dr\.", r"physician", r"clinic",
    r"hospital", r"diagnosed", r"treatment", r"patient",
    r"age[: ]", r"sex[: ]", r"weight[: ]", r"height[: ]",
    r"bp[: ]", r"pulse[: ]", r"temperature[: ]",
    r"investigation", r"advised", r"admission", r"discharge",
    r"report", r"lab[: ]", r"laboratory", r"radiology",
    r"address", r"phone[: ]", r"mobile[: ]", r"email[: ]",
    r"signature", r"regd\.?", r"drugs? prescribed"
]
NON_MED_REGEX = re.compile("|".join(NON_MED_PATTERNS), re.IGNORECASE)
# Rescue list for drug-like English words
rescue_list = {"d3", "b12", "k2", "iron", "zinc", "calcium", "vit", "xl"}
def is_potential_med_line(text: str) -> bool:
    """A line qualifies only if it has no non-medical keyword,
    contains a TAB/CAP/INJ-style anchor, and carries a digit
    (dose, strength, or count)."""
    t = text.lower()
    if NON_MED_REGEX.search(t):
        return False
    if not ANCHOR_PATTERN.search(t):
        return False
    return bool(re.search(r"\d", t))
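# For example:
#   is_potential_med_line("TAB CLOPITAB 75MG")        -> True
#   is_potential_med_line("Dr. Sharma, Cardiologist") -> False  (non-medical keyword)
#   is_potential_med_line("TAB SOBISISTAB")           -> False  (no digit)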
def validate_drug_match(term: str, drug_db, drug_token_index):
    """
    Map a SymSpell term -> canonical database drug, or None if noise.
    """
    if term in drug_db:
        return term
    if term in drug_token_index:
        # Pick one canonical name deterministically; change the
        # selection logic here if another policy is needed.
        return sorted(drug_token_index[term])[0]
    return None
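# For example, with a hypothetical index entry
# drug_token_index["forte"] == {"calcimax forte"}:
#   validate_drug_match("forte", drug_db, drug_token_index) -> "calcimax forte"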
def normalize_anchored_tokens(raw_text: str):
    """
    Use TAB/CAP/T. as anchors, not something to delete:
    - 'TABCLOPITAB75MG TAB' -> ['clopitab']
    - 'TAB SOBISISTAB' -> ['sobisistab']
    - 'TABSTARPRESSXL25MGTAB' -> ['starpressxl']
    """
    t = raw_text.lower()
    # Remove dosage units and bare numbers but keep anchor letters
    t = re.sub(r"\d+\s*(mg|ml|gm|%|u|mcg)", " ", t)
    t = re.sub(r"\d+", " ", t)
    tokens = t.split()
    normalized = []
    skip_next = False
    for i, tok in enumerate(tokens):
        if skip_next:
            skip_next = False
            continue
        # Strip stray periods so 'tab.' and 't.' behave like 'tab' and 't'
        base = tok.strip(".")
        # Case 1: token starts with an anchor as a prefix (no space)
        for pref in ANCHOR_PREFIXES:
            if base.startswith(pref) and len(base) > len(pref):
                base = base[len(pref):]
                break
        # Case 2: token is a pure anchor; attach it to the next token
        if base in ("tab", "cap", "t"):
            if i + 1 < len(tokens):
                merged = tokens[i + 1].strip(".")
                for pref in ANCHOR_PREFIXES:
                    if merged.startswith(pref) and len(merged) > len(pref):
                        merged = merged[len(pref):]
                        break
                base = merged
                skip_next = True
            else:
                continue
        base = base.strip(". ")  # drop leftover separators
        if len(base) >= 3:
            normalized.append(base)
    return normalized
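# A hypothetical OCR line exercising the anchor logic:
#   normalize_anchored_tokens("TAB. AZITHRAL 500MG") -> ['azithral']
# ('tab.' reduces to the pure anchor 'tab', which attaches to 'azithral').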
def initialize_database():
    data_path = os.path.join(os.path.dirname(__file__), "data/Dataset.csv")
    df = pd.read_csv(data_path)
    drug_db = set(df["Combined_Drugs"].astype(str).str.lower().str.strip())
    sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
    for drug in drug_db:
        sym_spell.create_dictionary_entry(drug, 100000)
        # Also index the individual words of multi-word drug names
        parts = drug.split()
        if len(parts) > 1:
            for p in parts:
                if len(p) > 3:
                    sym_spell.create_dictionary_entry(p, 100000)
    # Map each token back to the full canonical names containing it
    drug_token_index = {}
    for full in drug_db:
        for tok in full.split():
            if len(tok) < 3:
                continue
            drug_token_index.setdefault(tok, set()).add(full)
    # English filter: common words that should not count as drug hits
    try:
        url = (
            "https://raw.githubusercontent.com/first20hours/"
            "google-10000-english/master/google-10000-english-no-swears.txt"
        )
        response = requests.get(url, timeout=10)
        english_vocab = set(response.text.split())
    except Exception:
        # Offline fallback: a tiny stand-in vocabulary
        english_vocab = {"the", "and", "tab", "cap", "mg", "ml"}
    return {
        'drug_db': drug_db,
        'sym_spell': sym_spell,
        'drug_token_index': drug_token_index,
        'english_vocab': english_vocab,
        'rescue_list': rescue_list,
        'NON_MED_REGEX': NON_MED_REGEX,
        'ANCHOR_PATTERN': ANCHOR_PATTERN,
        'ANCHOR_PREFIXES': ANCHOR_PREFIXES
    }
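# process_image_ocr() below expects a pre-built pickle at
# cache/database_cache.pkl, but nothing in this file creates it. A
# minimal sketch for producing it, assuming the SymSpell instance and
# compiled regexes pickle cleanly (both are plain Python objects, so
# they normally do); build_database_cache is a helper name introduced
# here, not part of the original app:
def build_database_cache():
    cache = initialize_database()
    cache_dir = os.path.join(os.path.dirname(__file__), "cache")
    os.makedirs(cache_dir, exist_ok=True)
    with open(os.path.join(cache_dir, "database_cache.pkl"), "wb") as f:
        pickle.dump(cache, f)
    return cache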
def process_image_ocr(image_path):
    # Load the cached database; fall back to a fresh build if the
    # pickle is missing or unreadable
    cache_path = os.path.join(os.path.dirname(__file__), "cache/database_cache.pkl")
    try:
        with open(cache_path, 'rb') as f:
            cache = pickle.load(f)
    except (FileNotFoundError, EOFError, pickle.UnpicklingError):
        print("database_cache.pkl missing or unreadable; initializing database...")
        cache = initialize_database()
    drug_db = cache['drug_db']
    sym_spell = cache['sym_spell']
    drug_token_index = cache['drug_token_index']
    english_vocab = cache['english_vocab']
    rescue_list = cache['rescue_list']
    # Load image using cv2
    img = cv2.imread(image_path)
    if img is None:
        raise ValueError(f"Could not load image from {image_path}")
    # Create a RapidOCR engine (ONNX Runtime, mobile PP-OCRv4 models)
    ocr_engine = RapidOCR(
        params={
            "Global.max_side_len": 2000,
            "Det.engine_type": EngineType.ONNXRUNTIME,
            "Det.lang_type": LangDet.CH,
            "Det.model_type": ModelType.MOBILE,
            "Det.ocr_version": OCRVersion.PPOCRV4,
            "Cls.engine_type": EngineType.ONNXRUNTIME,
            "Cls.lang_type": LangCls.CH,
            "Cls.model_type": ModelType.MOBILE,
            "Cls.ocr_version": OCRVersion.PPOCRV4,
            "Rec.engine_type": EngineType.ONNXRUNTIME,
            "Rec.lang_type": LangRec.CH,
            "Rec.model_type": ModelType.MOBILE,
            "Rec.ocr_version": OCRVersion.PPOCRV4,
        }
    )
    # Run OCR (detection + orientation classification + recognition)
    ocr_result = ocr_engine(
        img,
        use_det=True,
        use_cls=True,
        use_rec=True,
        text_score=0.5,
        box_thresh=0.5,
        unclip_ratio=1.6,
        return_word_box=False,
    )
    # .txts can be None when nothing is detected
    ocr_data = ocr_result.txts or []
    found_meds_with_originals = {}
    for item in ocr_data:
        text_lower = item.lower()
        # Strong line-level gate
        if not is_potential_med_line(text_lower):
            continue
        # Skip doctor-name lines (cheap second guard on top of NON_MED_REGEX)
        if "dr." in text_lower or "dr " in text_lower:
            continue
        # Anchor-aware tokens
        candidate_tokens = normalize_anchored_tokens(item)
        # Optional SymSpell segmentation on the normalized tokens
        if candidate_tokens:
            segmentation = sym_spell.word_segmentation(" ".join(candidate_tokens))
            candidate_tokens = segmentation.corrected_string.split()
        for word in candidate_tokens:
            if len(word) < 3:
                continue
            if word in english_vocab and word not in rescue_list:
                continue
            # Check for an exact match first to avoid false positives
            # from SymSpell corrections
            canonical = validate_drug_match(word, drug_db, drug_token_index)
            if canonical:
                lines = found_meds_with_originals.setdefault(canonical, [])
                if item not in lines:
                    lines.append(item)
                continue  # exact match found, skip SymSpell
            suggestions = sym_spell.lookup(
                word, Verbosity.CLOSEST, max_edit_distance=1
            )
            if not suggestions:
                continue
            canonical = validate_drug_match(
                suggestions[0].term, drug_db, drug_token_index
            )
            if not canonical:
                continue  # reject noise that is not truly in drug_db
            lines = found_meds_with_originals.setdefault(canonical, [])
            if item not in lines:
                lines.append(item)
    print("\nJSON Output:")
    print(json.dumps(found_meds_with_originals, indent=4))
    return found_meds_with_originals
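# Example return value (hypothetical drug name and OCR line):
#   {"clopitab": ["TABCLOPITAB75MG TAB"]}
# i.e. canonical drug -> list of the raw OCR lines it was found on.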
def process_prescription(image):
    if image is None:
        return "No image uploaded."
    # Save the PIL image to a temp file for cv2, then clean it up.
    # Convert to RGB first so PNG uploads with an alpha channel still
    # save as JPEG.
    with tempfile.NamedTemporaryFile(suffix='.jpg', delete=False) as tmp:
        tmp_path = tmp.name
    try:
        image.convert("RGB").save(tmp_path)
        result = process_image_ocr(tmp_path)
    finally:
        os.unlink(tmp_path)
    return json.dumps(result, indent=4)
iface = gr.Interface(
    fn=process_prescription,
    inputs=gr.Image(type="pil", label="Upload Prescription Image"),
    outputs=gr.Textbox(label="Extracted Drugs", lines=20),
    title="MediBot - Drug Extraction from Prescriptions",
    description="Upload a prescription image to extract drug information."
)
if __name__ == "__main__":
    iface.launch()