# Source code for nmrezman.phase01.classify.classifier

# %%

# Models and tokenizers
import pickle 
import joblib
import nltk
import numpy as np	
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences
import imblearn

# nmrezman
from ...utils import preprocess_input

# Misc
import os

# Typing
from typing import Dict
from imblearn.pipeline import Pipeline


# %%

def extract_comments(clean_note: str, comments_model: Pipeline) -> str:
    """
    For reports with findings, extract the related comment that led to the finding.

    The note is split into sentences, each sentence is scored with
    ``comments_model.predict_proba`` and the sentence with the highest score in
    probability column 1 is selected. The sentence immediately preceding it (when
    there is one) is prepended to give some context.

    Args:
        clean_note (`str`):
            Preprocessed note
        comments_model (`imblearn.pipeline.Pipeline`):
            Pretrained comments model

    Returns:
        The preceding sentence (if any) concatenated directly with the highest-scoring
        sentence, or an empty string if no sentence could be extracted from the note.
    """

    # Make sure the Punkt sentence tokenizer data is available, but only reach out
    # to the download server when it is actually missing locally
    try:
        nltk.data.find("tokenizers/punkt")
    except LookupError:
        nltk.download('punkt')

    # Get a list of all the tokenized sentences in the clean note
    sentences = nltk.tokenize.sent_tokenize(clean_note)
    if not sentences:
        # Nothing to score; avoid calling max() on an empty list
        return ""

    # Score every sentence and pick the (first) one with the highest probability
    comment_pred_probs = [
        comments_model.predict_proba([sentence.encode("utf-8")])[:, 1][0]
        for sentence in sentences
    ]
    idx = comment_pred_probs.index(max(comment_pred_probs))
    predicted_comment = sentences[idx]

    # The first sentence has no predecessor. Indexing with idx - 1 == -1 would
    # silently wrap around and prepend the LAST sentence of the note instead.
    if idx == 0:
        return predicted_comment

    return sentences[idx - 1] + predicted_comment


def classifier(
    data: str,
    model_path: str,
) -> Dict[str, object]:
    """
    Results Management Classifier using biLSTMs according to Phase 01 of the project.

    Args:
        data (`str`):
            Radiologist report
        model_path (`str`):
            Path to the folder with model checkpoints and tokenizer

    .. note::

        The model weights and tokenizer should be located in the specified folder as:

        - ``findings_best_model.h5``
        - ``comment_best_model.sav``
        - ``lung_adrenal_best_model.h5``
        - ``lung_recommend_best_model.h5``
        - ``tokenizer.gz``

        for the (i) Findings vs No Finding Model, (ii) Comment Extraction Model,
        (iii) Lung vs Adrenal Findings Model, (iv) Lung Recommended Procedure model,
        and (v) tokenizer, respectively.

    Returns:
        A dictionary which includes the (1) recommended procedure, (2) nodule type (if found),
        (3) boolean indicating if a follow-up is required, and (4) the follow-up text (i.e., text
        of the report that indicates the finding) as stored / referenced by the dictionary keys
        "procedure", "noduleType", "followUpFlag", "followUpText", respectively

    Example::

        >>> report_txt = "a string with the radiology report text"
        >>> model_path = "/path/to/checkpoints/phase01/"
        >>> output = classifier(report_txt, model_path)
        >>> print("Output:")
        >>> [print(f" {key}:", value) for key, value in output.items()]
        ... Output:
        ...  procedure: Chest CT
        ...  noduleType: Lung
        ...  followUpFlag: Findings Present
        ...  followUpText: several pulmonary micronodules. follow-up in one year recommended.
    """

    # Define tokenizers
    findings_tokenizer = joblib.load(os.path.join(model_path, "tokenizer.gz"))

    # Load models
    #   - findings_model: model detecting if there is a finding or no finding (`findings_model_dict`)
    #   - lung_adrenal_model: model detecting if there is a lung or adrenal finding for reports with findings (`lung_adrenal_dict`)
    #   - comments_model: extract the relevant portion of report that mentions the finding
    #   - lung_recommended_proc: model to determine the recommended procedure for lung findings (`lung_recommended_proc_dict`)
    # NOTE: Depending on your application, the models can be loaded differently / elsewhere (e.g., as globals that are always loaded)
    findings_model = load_model(os.path.join(model_path, "findings_best_model.h5"), compile=True)
    # NOTE: unpickling can execute arbitrary code; only point `model_path` at trusted checkpoints.
    # The context manager guarantees the file handle is closed once the model is loaded.
    with open(os.path.join(model_path, "comment_best_model.sav"), "rb") as comments_file:
        comments_model = pickle.load(comments_file)
    lung_adrenal_model = load_model(os.path.join(model_path, "lung_adrenal_best_model.h5"), compile=True)
    lung_recommended_proc = load_model(os.path.join(model_path, "lung_recommend_best_model.h5"), compile=True)

    # Define label embeddings / enumerations for the three models
    # Findings vs no findings model output
    findings_model_dict = {
        0: "No Findings Present",
        1: "Findings Present",
    }

    # Lung vs adrenal findings model output
    lung_adrenal_dict = {
        0: "Lung",
        1: "Adrenal",
    }

    # Lung recommended procedure model output
    lung_recommended_proc_dict = {
        0: "Ambiguous",
        1: "Chest CT",
    }

    # Preprocess the note, getting the impression, removing doctor signatures, etc.
    clean_findings_imp = preprocess_input(data, is_phase_2=False)

    # Tokenize the impression
    X = findings_tokenizer.texts_to_sequences([clean_findings_imp])
    X = pad_sequences(X, maxlen=300, padding="pre")

    # Classify if there is a finding or not and get as layman text
    y_pred = findings_model.predict(X)
    y_pred_class = np.argmax(y_pred, axis=1)
    follow_up_flg = findings_model_dict[y_pred_class[0]]

    # Based on the finding, run through more model(s)
    if follow_up_flg == "Findings Present":
        # Finding detected; follow-up recommended
        # Classify if the finding is lung or adrenal
        lung_adrenal_pred = lung_adrenal_model.predict(X)
        lung_or_adrenal_finding = lung_adrenal_dict[np.argmax(lung_adrenal_pred)]

        # Get follow-up text
        final_comment = extract_comments(clean_findings_imp, comments_model)

        # Based on the finding type, get the procedure recommendation
        if lung_or_adrenal_finding == "Lung":
            # Lung finding run through another model to determine recommended procedure
            lung_recommended_proc_pred = lung_recommended_proc.predict(X)
            lung_adrenal_recommended_proc_label = lung_recommended_proc_dict[np.argmax(lung_recommended_proc_pred)]
        else:
            # Only one recommendation for adrenal findings
            lung_adrenal_recommended_proc_label = "Endocrinology Referral"
    else:
        # No finding detected; no follow-up recommended, etc.
        final_comment = "NA"
        lung_or_adrenal_finding = "NA"
        lung_adrenal_recommended_proc_label = "NA"

    # Output includes classification and recommendations
    output = {
        "procedure": lung_adrenal_recommended_proc_label,   # Recommended procedure. For "Lung" finding, "Ambiguous" or "Chest CT". For "Adrenal", "Endocrinology Referral". "NA" when no finding
        "noduleType": lung_or_adrenal_finding,              # Options defined in `lung_adrenal_dict`, or "NA" when no finding
        "followUpFlag": follow_up_flg,                      # "Boolean" for findings: "Findings Present" or "No Findings Present"
        "followUpText": final_comment,                      # Follow-up text (i.e., text that indicates finding), or "NA" when no finding
    }

    return output