# Source code for nmrezman.phase01.classify.classifier

# %%

# Models and tokenizers
import pickle 
import joblib
import nltk
import numpy as np	
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences
import imblearn

# nmrezman
from ...utils import preprocess_input

# Misc
import os

# Typing
from typing import Dict
from imblearn.pipeline import Pipeline


# %%

def extract_comments(clean_note: str, comments_model: Pipeline) -> str:
    """
    For reports with findings, extract the related comment that led to the finding.

    The note is split into sentences, every sentence is scored with ``comments_model.predict_proba``
    (column 1 of the returned probabilities), and the highest-scoring sentence is returned together
    with the sentence immediately before it, when there is one.

    Args:
        clean_note (`str`):
            Preprocessed note
        comments_model (`imblearn.pipeline.Pipeline`):
            Pretrained comments model

    Returns:
        The sentence preceding the highest-scoring sentence (if any) concatenated directly with the
        highest-scoring sentence. An empty string is returned when the note contains no sentences.
    """

    # Make sure the Punkt sentence tokenizer data is available.
    # Only download when it is missing so that not every call triggers a download attempt.
    try:
        nltk.data.find("tokenizers/punkt")
    except LookupError:
        nltk.download('punkt')

    # Get a list of all the tokenized sentences in the clean note
    sentences = nltk.tokenize.sent_tokenize(clean_note)
    if not sentences:
        # Nothing to score; avoid calling `max()` on an empty list
        return ""

    # Get the highest probability predicted comment (first one wins on ties)
    comment_pred_probs = [
        comments_model.predict_proba([sentence.encode("utf-8")])[:, 1][0]
        for sentence in sentences
    ]
    idx = comment_pred_probs.index(max(comment_pred_probs))
    predicted_comment = sentences[idx]

    # Only prepend the previous sentence when one exists. Without this guard, `idx - 1 == -1`
    # would wrap around and prepend the LAST sentence of the note instead.
    if idx == 0:
        return predicted_comment

    return sentences[idx - 1] + predicted_comment


def classifier(
        data: str,
        model_path: str,
) -> Dict[str, object]:
    """
    Results Management Classifier using biLSTMs according to Phase 01 of the project.

    Args:
        data (`str`):
            Radiologist report
        model_path (`str`):
            Path to the folder with model checkpoints and tokenizer

            .. note::

                The model weights and tokenizer should be located in the specified folder as:
                    - ``findings_best_model.h5``
                    - ``lung_adrenal_best_model.h5``
                    - ``comment_best_model.sav``
                    - ``lung_recommend_best_model.h5``
                    - ``tokenizer.gz``
                for the (i) Findings vs No Finding Model, (ii) Lung vs Adrenal Findings Model, (iii) Comment Extraction
                Model, (iv) Lung Recommended Procedure model, and (v) tokenizer, respectively.

    Returns:
        A dictionary which includes the (1) recommended procedure, (2) nodule type (if found), (3) label indicating
        whether findings are present (i.e., whether a follow-up is required), and (4) the follow-up text (i.e., text
        of the report that indicates the finding) as stored / referenced by the dictionary keys "procedure",
        "noduleType", "followUpFlag", "followUpText", respectively. When no finding is detected, "procedure",
        "noduleType", and "followUpText" are all set to "NA".

    Example::

        >>> report_txt = "a string with the radiology report text"
        >>> model_path = "/path/to/checkpoints/phase01/"
        >>> output = classifier(report_txt, model_path)
        >>> print("Output:")
        >>> [print(f"  {key}:", value) for key, value in output.items()]
        ... Output:
        ...   procedure: Chest CT
        ...   noduleType: Lung
        ...   followUpFlag: Findings Present
        ...   followUpText: several pulmonary micronodules. follow-up in one year recommended.
    """

    # Define tokenizers
    findings_tokenizer = joblib.load(os.path.join(model_path, "tokenizer.gz"))

    # Load models
    #   - findings_model:               model detecting if there is a finding or no finding (`findings_model_dict`)
    #   - lung_adrenal_model:           model detecting if there is a lung or adrenal finding for reports with findings (`lung_adrenal_dict`)
    #   - comments_model:               extract the relevant portion of report that mentions the finding
    #   - lung_recommended_proc:        model to determine the recommended procedure for lung findings (`lung_recommended_proc_dict`)
    # NOTE: Depending on your application, the models can be loaded differently / elsewhere (e.g., as globals that are always loaded)
    findings_model = load_model(os.path.join(model_path, "findings_best_model.h5"), compile=True)
    # NOTE: `pickle.load` can execute arbitrary code; only point `model_path` at trusted checkpoints.
    # The context manager makes sure the file handle is closed once the model is loaded.
    with open(os.path.join(model_path, "comment_best_model.sav"), "rb") as comments_model_file:
        comments_model = pickle.load(comments_model_file)
    lung_adrenal_model = load_model(os.path.join(model_path, "lung_adrenal_best_model.h5"), compile=True)
    lung_recommended_proc = load_model(os.path.join(model_path, "lung_recommend_best_model.h5"), compile=True)

    # Define label embeddings / enumerations for the three models
    # Findings vs no findings model output
    findings_model_dict = {
        0: "No Findings Present",
        1: "Findings Present",
    }
    # Lung vs adrenal findings model output
    lung_adrenal_dict = {
        0: "Lung",
        1: "Adrenal",
    }
    # Lung recommended procedure model output
    lung_recommended_proc_dict = {
        0: "Ambiguous",
        1: "Chest CT",
    }

    # Preprocess the note, getting the impression, removing doctor signatures, etc.
    clean_findings_imp = preprocess_input(data, is_phase_2=False)

    # Tokenize the impression and left-pad it to the fixed sequence length passed to the models
    X = findings_tokenizer.texts_to_sequences([clean_findings_imp])
    X = pad_sequences(X, maxlen=300, padding="pre")

    # Classify if there is a finding or not and get as layman text
    y_pred = findings_model.predict(X)
    y_pred_class = np.argmax(y_pred, axis=1)
    follow_up_flg = findings_model_dict[y_pred_class[0]]

    # Based on the finding, run through more model(s)
    if follow_up_flg == "Findings Present":
        # Finding detected; follow-up recommended

        # Classify if the finding is lung or adrenal
        lung_adrenal_pred = lung_adrenal_model.predict(X)
        lung_or_adrenal_finding = lung_adrenal_dict[np.argmax(lung_adrenal_pred)]

        # Get follow-up text
        final_comment = extract_comments(clean_findings_imp, comments_model)

        # Based on the finding type, get the procedure recommendation
        if lung_or_adrenal_finding == "Lung":
            # Lung finding run through another model to determine recommended procedure
            lung_recommended_proc_pred = lung_recommended_proc.predict(X)
            lung_adrenal_recommended_proc_label = lung_recommended_proc_dict[np.argmax(lung_recommended_proc_pred)]
        else:
            # Only one recommendation for adrenal findings
            lung_adrenal_recommended_proc_label = "Endocrinology Referral"

    else:
        # No finding detected; no follow-up recommended, etc.
        final_comment = "NA"
        lung_or_adrenal_finding = "NA"
        lung_adrenal_recommended_proc_label = "NA"

    # Output includes classification and recommendations
    output = {
        # Recommended procedure. For "Lung" finding, "Ambiguous" or "Chest CT". For "Adrenal", "Endocrinology Referral"
        "procedure": lung_adrenal_recommended_proc_label,
        # Options defined in `lung_adrenal_dict` ("Lung" or "Adrenal"), or "NA" when there is no finding
        "noduleType": lung_or_adrenal_finding,
        # Label from `findings_model_dict`: "Findings Present" or "No Findings Present"
        "followUpFlag": follow_up_flg,
        # Follow-up text (i.e., text that indicates finding), or "NA" when there is no finding
        "followUpText": final_comment,
    }

    return output