# Source code for nmrezman.phase01.train.general

# %%

# Models and tokenizers
import joblib
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import keras.backend as K
from tensorflow.keras.utils import to_categorical
from keras.callbacks import ModelCheckpoint, EarlyStopping
from keras.models import load_model
import fasttext

import nltk
from xgboost import XGBClassifier
from imblearn.pipeline import make_pipeline, Pipeline
from imblearn.under_sampling import RandomUnderSampler
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import precision_score, recall_score, f1_score

# nmrezman
from ..models import get_bilstm_findings_classifier, get_bilstm_lung_adrenal_classifier, recommended_proc_model
from ...utils import generate_eval_report

# Misc
import os
import numpy as np
from tqdm import tqdm
import pandas as pd
import pickle

# Typing
from typing import Tuple, Union
import numpy.typing as npt
from keras.models import Sequential


# %%

def tokenize(
        x: pd.Series,
        max_num_words: int,
        max_sequence_length: int,
        tokenizer_fname: Union[str, None]=None,
        is_create: bool=True,
    ) -> Tuple[npt.NDArray, dict, int]:
    """
    Converts report text into fixed-length, left-padded integer sequences.

    Args:
        x (`pandas.Series`):
            Texts to tokenize
        max_num_words (`int`):
            Passed to the Keras ``Tokenizer`` as ``num_words``; only used when ``is_create`` is ``True``
        max_sequence_length (`int`):
            Length of every returned sequence (``maxlen`` of ``pad_sequences``)
        tokenizer_fname (`str`):
            Path / filename the tokenizer is saved to (``is_create=True``) or loaded from (``is_create=False``)
        is_create (`bool`):
            Fit and save a new tokenizer when ``True``; load an existing one otherwise

    Returns:
        Tuple of the padded sequences, the tokenizer's ``word_index`` and the vocab size (``len(word_index) + 1``)

    Raises:
        ValueError: If ``tokenizer_fname`` is not provided
    """

    # The tokenizer is always either written to or read from disk, so fail before doing any work
    if not tokenizer_fname:
        raise ValueError("`tokenizer_fname` is required to save (is_create=True) or load (is_create=False) the tokenizer")

    # Define the tokenizer
    # Lowercase the text; filter out special characters
    if is_create:
        # Make the output dir up front; a bare filename has no dir part and os.makedirs("") would raise
        tokenizer_dir = os.path.dirname(tokenizer_fname)
        if tokenizer_dir:
            os.makedirs(tokenizer_dir, exist_ok=True)

        tokenizer = Tokenizer(num_words=max_num_words, filters=r'!"#$%&()*+,-./:;<=>?@[\]^_`{|}~', lower=True)
        tokenizer.fit_on_texts(x)
    else:
        tokenizer = joblib.load(tokenizer_fname)
    word_index = tokenizer.word_index
    vocab_size = len(word_index)+1

    # Tokenize the notes
    # Pad at the front since radiology findings are almost always located in the last section of the report
    x_tokenized = tokenizer.texts_to_sequences(x)
    x_tokenized = pad_sequences(x_tokenized, maxlen=max_sequence_length, padding="pre")

    # Save the tokenizer, which will be needed for classification and training other models
    if is_create:
        joblib.dump(tokenizer, tokenizer_fname)

    return x_tokenized, word_index, vocab_size


def train(
        x_tokenized: npt.NDArray,
        y: Union[list, npt.NDArray],
        model: Sequential,
        model_checkpoint_name: str="./output/best_model.h5",
        epochs: int=50,
        class_weight: dict=None,
        result_fname: str="./output/validation.log",
    ) -> None:
    """
    Trains a Keras model, keeps the checkpoint with the lowest validation loss and writes an evaluation report.

    Args:
        x_tokenized (`numpy.ndarray`):
            Tokenized and padded input sequences
        y (`list` or `numpy.ndarray`):
            Integer class labels, one per row of ``x_tokenized``
        model (`keras.models.Sequential`):
            Model to fit
        model_checkpoint_name (`str`):
            Path / filename to save model checkpoints
        epochs (`int`):
            Maximum number of epochs (early stopping may end training sooner)
        class_weight (`dict`):
            Optional per-class loss weights forwarded to ``model.fit``
        result_fname (`str`):
            Path / filename to save model evaluation metrics
    """

    # Make dirs before wasting time running model
    # A bare filename has no directory part and os.makedirs("") would raise, so skip it in that case
    for out_path in (model_checkpoint_name, result_fname):
        out_dir = os.path.dirname(out_path)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)

    # Split the data into train and test
    # NOTE: the test split is also used as validation data for early stopping / checkpoint selection below
    train_x, test_x, train_y, test_y = train_test_split(x_tokenized, y, test_size=0.30, random_state=133278)

    # One-hot encode both splits with the class count of the full label set, so the two
    # encodings have the same width even if a split happens to miss the highest class
    num_classes = int(np.max(y)) + 1

    # Clear the Keras backend, starting model training from scratch
    # NOTE(review): `model` was built by the caller before this call -- confirm clearing the session here is intended
    K.clear_session()

    # Train!
    batch_size = 100
    es = EarlyStopping(monitor="val_loss", mode="min", verbose=1, patience=15,)
    mc = ModelCheckpoint(model_checkpoint_name, monitor="val_loss", mode="min", verbose=1, save_best_only=True,)
    model.fit(
        train_x,
        to_categorical(train_y, num_classes=num_classes),
        class_weight=class_weight,
        epochs=epochs,
        batch_size=batch_size,
        callbacks=[es, mc],
        verbose=1,
        validation_data=(test_x, to_categorical(test_y, num_classes=num_classes)),
    )

    # Load in the best model
    best_model = load_model(model_checkpoint_name)

    # Perform confusion matrix and save the results
    y_pred = best_model.predict(np.array(test_x))
    y_pred = np.argmax(y_pred, axis=1)
    generate_eval_report(test_y, y_pred, result_fname)


# %% [markdown]
# # Train Findings vs No Findings Model

# %%

def _load_glove_embeddings(glove_embedding_path: str) -> dict:
    """Parses a GloVe text file into a ``{word: vector}`` dict, skipping blank or malformed lines."""
    glove_embeddings_index = {}
    with open(glove_embedding_path, encoding="utf8") as f:
        for line in f:
            values = line.split()
            if not values:
                continue
            try:
                coefs = np.asarray(values[1:], dtype="float32")
            except ValueError:
                # A non-numeric field means the line is not "<word> <floats...>"; skip it
                # rather than storing the previous line's vector under this word
                continue
            glove_embeddings_index[values[0]] = coefs
    return glove_embeddings_index


def train_findings_model(
        data_path: str,
        glove_embedding_path: str,
        model_checkpoint_name: str="findings_best_model.h5",
        result_fname: str="findings_best_result.log",
        tokenizer_fname: str="tokenizer.gz"
    ) -> None:
    """
    Trains the Findings vs No Findings Phase 01 BiLSTM model.

    Args:
        data_path (`str`):
            Path to the dataframe file with the preprocessed impressions and labels in ``new_note`` and
            ``selected_finding`` columns, respectively
        glove_embedding_path (`str`):
            Path to the pre-downloaded GloVe Stanford pretrained word vectors ``glove.6B.300d`` as found at
            https://nlp.stanford.edu/projects/glove/
        model_checkpoint_name (`str`):
            Path / filename to save model checkpoints
        result_fname (`str`):
            Path / filename to save model evaluation metrics
        tokenizer_fname (`str`):
            Path / filename to save tokenizer

    Raises:
        ValueError: If a GloVe vector does not have the expected embedding dimension
    """

    # Import data
    # NOTE: this data has already been preprocessed, extracting the findings, removing Dr signature, etc.
    # See `from ..utils import preprocess_input`
    modeling_df = joblib.load(data_path)

    # Get preprocessed notes and labels as already indicated by the dataframe
    X = modeling_df["new_note"]
    labels = [0 if i == "No Findings" else 1 for i in modeling_df["selected_finding"]]

    # Define model constants
    max_sequence_length = 300       # Max length of report. Avg NM is ~250
    max_num_words = 15000           # Max number of words for init vocab; the actual vocab size used by the model will be different
    glove_embedding_dim = 300       # GloVe embedding dimension size

    # Tokenize the data
    X_tokenized, word_index, vocab_size = tokenize(
        x=X,
        max_num_words=max_num_words,
        max_sequence_length=max_sequence_length,
        tokenizer_fname=tokenizer_fname,
        is_create=True,
    )

    # Get GloVe embedding matrix
    # NOTE: Stanford pretrained word vectors glove.6B.300d were downloaded from https://nlp.stanford.edu/projects/glove/
    glove_embeddings_index = _load_glove_embeddings(glove_embedding_path)
    glove_embedding_matrix = np.random.random((len(word_index) + 1, glove_embedding_dim))
    for word, i in word_index.items():
        glove_embedding_vector = glove_embeddings_index.get(word)
        if glove_embedding_vector is None:
            # Words not found in the embedding index keep their random initialization
            continue
        if len(glove_embedding_vector) != glove_embedding_dim:
            raise ValueError(
                f"GloVe vector for {word!r} has dimension {len(glove_embedding_vector)} but "
                f"{glove_embedding_dim} was expected; make sure `glove_embedding_path` points to the "
                f"{glove_embedding_dim}-d GloVe vectors"
            )
        glove_embedding_matrix[i] = glove_embedding_vector

    # Define the model
    model = get_bilstm_findings_classifier(
        max_sequence_length=max_sequence_length,
        max_num_words=vocab_size,
        glove_embedding_dim=glove_embedding_dim,
        glove_embedding_matrix=glove_embedding_matrix,
    )

    # Train and evaluate
    train(
        x_tokenized=X_tokenized,
        y=labels,
        model=model,
        model_checkpoint_name=model_checkpoint_name,
        epochs=30,
        result_fname=result_fname,
    )
# %% [markdown]
# # Train Lung vs Adrenal Findings Model

# %%

def train_lung_adrenal_model(
        data_path: str,
        bioword_path: str,
        model_checkpoint_name: str="lung_adrenal_best_model.h5",
        result_fname: str="lung_adrenal_best_result.log",
        tokenizer_fname: str="tokenizer.gz"
    ) -> None:
    """
    Trains the Lung vs Adrenal Findings Phase 01 BiLSTM model.

    Args:
        data_path (`str`):
            Path to the dataframe file with the preprocessed impressions and labels in ``new_note`` and
            ``selected_finding`` columns, respectively
        bioword_path (`str`):
            Path to the BioWordVec pretrained word vectors ``BioWordVec_PubMed_MIMICIII_d200.bin`` as from
            https://ftp.ncbi.nlm.nih.gov/pub/lu/Suppl/BioSentVec/BioWordVec_PubMed_MIMICIII_d200.bin
        model_checkpoint_name (`str`):
            Path / filename to save model checkpoints
        result_fname (`str`):
            Path / filename to save model evaluation metrics
        tokenizer_fname (`str`):
            Path / filename of an existing tokenizer to load (this function does not create one)
    """

    # Import data
    # NOTE: this data has already been preprocessed, extracting the findings, removing Dr signature, etc.
    # See `from ..utils import preprocess_input`
    modeling_df = joblib.load(data_path)

    # Get preprocessed notes and labels as already indicated by the dataframe
    # Only reports that have a finding are used; "Lung Findings" -> 0, everything else -> 1
    findings_df = modeling_df[modeling_df["selected_finding"]!="No Findings"]
    X = findings_df["new_note"]
    labels = [0 if i == "Lung Findings" else 1 for i in findings_df["selected_finding"]]

    # Define model constants
    max_sequence_length = 300       # Max length of report. Avg NM is ~250
    max_num_words = 20000           # Max number of words for init vocab; the actual vocab size used by the model will be different
    bioword_embedding_dim = 200     # Bioword embedding dimension size

    # Tokenize the data with the previously saved tokenizer
    X_tokenized, word_index, vocab_size = tokenize(
        x=X,
        max_num_words=max_num_words,
        max_sequence_length=max_sequence_length,
        tokenizer_fname=tokenizer_fname,
        is_create=False,
    )

    # Load the model for (bio) word embedding
    # NOTE: bioword word vector downloaded from: https://ftp.ncbi.nlm.nih.gov/pub/lu/Suppl/BioSentVec/BioWordVec_PubMed_MIMICIII_d200.bin
    bioword_model = fasttext.load_model(bioword_path)

    # Prepare the word embedding matrix
    # The matrix gets one row per vocab entry so its shape matches the `vocab_size` handed to the
    # classifier below; rows at or beyond `max_num_words` are left as zeros
    bioword_embedding_matrix = np.zeros((vocab_size, bioword_embedding_dim))
    for word, i in tqdm(word_index.items()):
        if i >= max_num_words:
            continue
        bioword_embedding_matrix[i] = bioword_model.get_word_vector(word)

    # Define the model
    model = get_bilstm_lung_adrenal_classifier(
        max_num_words=vocab_size,
        bioword_embedding_dim=bioword_embedding_dim,
        max_sequence_length=max_sequence_length,
        bioword_embedding_matrix=bioword_embedding_matrix,
    )

    # Train and evaluate
    train(
        x_tokenized=X_tokenized,
        y=labels,
        model=model,
        model_checkpoint_name=model_checkpoint_name,
        epochs=50,
        class_weight={0:1, 1:100},
        result_fname=result_fname,
    )
# %% [markdown]
# # Train Lung Recommended Procedure (Chest CT or Ambiguous) Model

# %%

# %% [markdown]
# # Train Comment Extraction Model

# %%

def train_comment_model(
        data_path: str,
        model_checkpoint_name: str="comment_best_model.sav",
        result_fname: str="comment_best_result.log",
    ) -> None:
    """
    Trains the Comment Extraction Phase 01 XGBoost model.

    Args:
        data_path (`str`):
            Path to the dataframe file; this function reads its ``selected_finding``, ``note``,
            ``selected_label`` and ``rpt_num`` columns
        model_checkpoint_name (`str`):
            Path / filename to save model checkpoints
        result_fname (`str`):
            Path / filename to save model evaluation metrics
    """

    # Import data
    # NOTE: this data has already been preprocessed, extracting the findings, removing Dr signature, etc.
    # See `from ..utils import preprocess_input`
    modeling_df = joblib.load(data_path)

    # Get the portion of the dataset that includes only findings
    only_findings_df = modeling_df[modeling_df["selected_finding"]!="No Findings"]

    # Split into train and test data
    # NOTE: the hold-out part is not used here, and no random_state is set so the split differs between runs
    train_df, _ = train_test_split(only_findings_df, test_size=0.2)

    # Tokenize into sentences. Model training is done on sentences vs the whole report
    nltk.download("punkt")
    sentences = []
    sent_labels = []
    rpt_nums = []
    for _, row in tqdm(train_df.iterrows()):
        row_sents = nltk.tokenize.sent_tokenize(row["note"])
        last_sentence_label = nltk.tokenize.sent_tokenize(row["selected_label"])[-1]
        for sent in row_sents:
            # A sentence is positive when it is contained in the last sentence of the selected label
            sent_labels.append(1 if sent in last_sentence_label else 0)
            rpt_nums.append(row["rpt_num"])
        sentences.extend(row_sents)

    # Create a dataframe sentence classifier
    sent_class_df = pd.DataFrame({
        "sentence": sentences,
        "finding_sent": sent_labels,
        "rpt_num": rpt_nums,
    })

    # Define the TF-IDF features
    my_stop_words = ["the", "is", "are", "a", "there", "for", "in"]
    tvec = TfidfVectorizer(stop_words=my_stop_words, max_features=1000, ngram_range=(1,3))

    # Define XGBoost classifier
    xgb = XGBClassifier(eval_metric="error", use_label_encoder=False)

    # Get the XGBoost pipeline and train
    print("XgBoost results")
    print("+"*100)
    RUS_pipeline = make_pipeline(tvec, RandomUnderSampler(random_state=777), xgb)
    lr_cv(5, sent_class_df["sentence"], sent_class_df["finding_sent"], RUS_pipeline, "macro", model_checkpoint_name, result_fname)
def _select_rows(data, indices):
    """Selects rows by position from a pandas object (``.iloc``) or any other indexable array."""
    return data.iloc[indices] if hasattr(data, "iloc") else data[indices]


def lr_cv(
        splits: int,
        X: pd.Series,
        Y: pd.Series,
        pipeline: Pipeline,
        average_method: str,
        model_checkpoint_name: str,
        result_fname: str,
    ) -> None:
    """
    Trains the comment extraction model

    Runs stratified k-fold cross validation, prints the metrics of every fold, pickles the pipeline
    and writes a summary of the metrics to ``result_fname``.

    Args:
        splits (`int`):
            Number of folds
        X (`pandas.Series`):
            Series of train and test X data
        Y (`pandas.Series`):
            Series of train and test Y data
        pipeline (`imblearn.pipeline.Pipeline`):
            Pipeline of transforms and resamples with a final estimator
        average_method (`str`):
            Type of averaging performed on the data
        model_checkpoint_name (`str`):
            Path / filename to save model checkpoints
        result_fname (`str`):
            Path / filename to save model evaluation metrics
    """

    # Make dirs before wasting time training
    # A bare filename has no directory part and os.makedirs("") would raise, so skip it in that case
    for out_path in (model_checkpoint_name, result_fname):
        out_dir = os.path.dirname(out_path)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)

    # Train and evaluate!
    kfold = StratifiedKFold(n_splits=splits, shuffle=True, random_state=777)
    accuracy = []
    precision = []
    recall = []
    finding_recall = []
    no_finding_recall = []
    finding_precision = []
    no_finding_precision = []
    f1 = []
    f1_all = []
    for train_idx, test_idx in kfold.split(X, Y):
        # kfold.split yields positional indices, so select by position rather than by index label
        x_train, y_train = _select_rows(X, train_idx), _select_rows(Y, train_idx)
        x_test, y_test = _select_rows(X, test_idx), _select_rows(Y, test_idx)

        lr_fit = pipeline.fit(x_train, y_train)
        prediction = lr_fit.predict(x_test)
        scores = lr_fit.score(x_test, y_test)
        print(" no_finding finding_present ")
        accuracy.append(scores*100)

        p_score = precision_score(y_test, prediction, average=None)
        print("precision:", p_score)
        precision.append(precision_score(y_test, prediction, average=average_method)*100)
        finding_precision.append(p_score[1])
        no_finding_precision.append(p_score[0])

        r_score = recall_score(y_test, prediction, average=None)
        print("recall: ", r_score)
        recall.append(recall_score(y_test, prediction, average=average_method)*100)
        finding_recall.append(r_score[1])
        no_finding_recall.append(r_score[0])

        f1.append(f1_score(y_test, prediction, average=average_method)*100)
        f_score = f1_score(y_test, prediction, average=None)
        f1_all.append(f_score)
        print("f1 score: ", f_score)
        print("-"*50)

    # Save the model
    # NOTE: `pipeline` is refit on every fold, so what is saved is the fit from the last fold
    with open(model_checkpoint_name, "wb") as model_fh:
        pickle.dump(pipeline, model_fh)

    # Print a summary of the results
    summary_lines = [
        "accuracy: %.2f%% (+/- %.2f%%)" % (np.mean(accuracy), np.std(accuracy)),
        "precision: %.2f%% (+/- %.2f%%)" % (np.mean(precision), np.std(precision)),
        "recall: %.2f%% (+/- %.2f%%)" % (np.mean(recall), np.std(recall)),
        "f1 score: %.2f%% (+/- %.2f%%)" % (np.mean(f1), np.std(f1)),
        "Finding Recall: %.2f%%" % (np.mean(finding_recall)*100),
        "No Finding Recall: %.2f%%" % (np.mean(no_finding_recall)*100),
        "Finding Precision: %.2f%%" % (np.mean(finding_precision)*100),
        "No Finding Precision: %.2f%%" % (np.mean(no_finding_precision)*100),
    ]
    for summary_line in summary_lines:
        print(summary_line)

    # Write results out to a file
    with open(result_fname, "w") as fh:
        fh.write("Classification Report:")
        for summary_line in summary_lines:
            fh.write("\n\t" + summary_line)
        fh.write("\n\nAll Results:")
        for precision_no_finding, precision_finding, recall_no_finding, recall_finding, f1sc in zip(no_finding_precision, finding_precision, no_finding_recall, finding_recall, f1_all):
            fh.write("\n\t no_finding finding_present ")
            fh.write(f"\n\tprecision: [{precision_no_finding:0.8f} {precision_finding:0.8f}]")
            fh.write(f"\n\trecall: [{recall_no_finding:0.8f} {recall_finding:0.8f}]")
            fh.write(f"\n\tf1 score: {f1sc}")
            fh.write("\n\t"+"-"*50)


# %%