# SIIM_Prompter/RunLLM_ollama.py at main · Mayo-Radiology-Informatics-Lab/SIIM_Prompter · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
from radprompter import Prompt, RadPrompter, vLLMClient, OllamaClient, OpenAIClient
import os
import pandas as pd
import numpy as np
import pandas as pd

# Single source of truth for the model: it is passed to the Ollama client AND
# embedded in the output file names, so results are always labelled with the
# model that actually produced them.
MODEL = 'llama3'

# Paths are expanded up front because os.path.exists()/os.remove() treat a
# leading '~' literally and would otherwise never find the files.
INPUT_FILE = os.path.expanduser('~/Desktop/SIIMCombinedReports.xlsx')
OUTPUT_FILE = os.path.expanduser('~/Desktop/SIIM_Results-' + MODEL + '.csv')
TEMP_OUT = os.path.expanduser('~/Desktop/output-' + MODEL + '.csv')


def clean_reports(reports_df):
    """Return a cleaned copy of the reports table.

    Args:
        reports_df: DataFrame with at least the 'Findings' and 'Report'
            text columns.

    Returns:
        A new DataFrame in which spaces are removed from 'Findings';
        newlines, the '_0x000D_' marker and runs of 4 / 2 spaces are removed
        from 'Report'; and every NaN or literal 'None' cell is replaced
        by 'No'.
    """
    cleaned = reports_df.copy()

    # Strip spaces out of the Findings column.
    cleaned['Findings'] = cleaned['Findings'].str.replace(' ', '', regex=False)

    # All patterns are literal text, so regex matching is switched off.
    report_text = cleaned['Report']
    for junk in ('\n', '_0x000D_', '    ', '  '):
        report_text = report_text.str.replace(junk, '', regex=False)
    cleaned['Report'] = report_text

    return cleaned.replace({np.nan: 'No', 'None': 'No'})


def summarize_findings(reports_df):
    """Print, per 'ExamClass' category, the count and share of each finding.

    Args:
        reports_df: Cleaned DataFrame with 'ExamClass' and 'Findings' columns.
    """
    categories = reports_df['ExamClass'].unique()
    print(categories)

    for category in categories:
        findings = reports_df.loc[reports_df['ExamClass'] == category, 'Findings']
        total = len(findings)
        print(f"Category: {category}")
        for value, count in findings.value_counts().items():
            print(f"{value}: {count} {count*100//total}%")
        print(f"Total: {total}")
        print()


def select_reports(reports_df):
    """Build the prompt items for every row with a non-blank report.

    Args:
        reports_df: Cleaned DataFrame with 'Report' and 'ExamClass' columns.

    Returns:
        A tuple ``(items, kept_rows)``. ``items`` is a list of
        ``{'report': ..., 'filename': ...}`` dicts; ``kept_rows`` is the
        matching subset of ``reports_df`` re-indexed 0..n-1, so that row ``i``
        of ``kept_rows`` describes ``items[i]``.
    """
    keep_mask = [bool(text.strip()) for text in reports_df['Report']]
    kept_rows = reports_df.loc[keep_mask].reset_index(drop=True)
    items = [
        {'report': text.strip(), 'filename': category}
        for text, category in zip(kept_rows['Report'], kept_rows['ExamClass'])
    ]
    return items, kept_rows


def attach_findings(output_df, findings):
    """Turn raw engine output into the shareable results table.

    Args:
        output_df: Engine output, indexed by its 'index' column, with
            'filename' and 'report' columns.
        findings: 'Findings' Series whose index lines up with ``output_df``.

    Returns:
        ``output_df`` with 'filename' renamed to 'ExamClass', the 'report'
        column (report text) removed, and 'Findings' joined on the index.
    """
    out_df = output_df.rename(columns={'filename': 'ExamClass'})
    out_df = out_df.drop(columns=['report'])
    return out_df.join(findings)


def main():
    """Run the prompt over every report and write the de-identified results."""
    prompt = Prompt("SIIM.toml")
    client = OllamaClient(
        model=MODEL,
        base_url="http://localhost:11434/v1",
        temperature=0.0,
        seed=42
    )

    # Delete any prior output so stale rows cannot leak into this run.
    if os.path.exists(TEMP_OUT):
        os.remove(TEMP_OUT)

    engine = RadPrompter(
        client=client,
        prompt=prompt,
        output_file=TEMP_OUT,
    )

    # Load the Excel file into a DataFrame and normalise its text columns.
    reports_df = clean_reports(pd.read_excel(INPUT_FILE))
    summarize_findings(reports_df)

    # Blank reports are skipped; kept_rows stays aligned with the list that
    # is sent to the engine so 'Findings' can be matched up afterwards.
    reports, kept_rows = select_reports(reports_df)

    print ('Doing inference...')
    engine(reports)

    # NOTE(review): assumes the 'index' column written by the engine is the
    # position of each item in `reports` -- confirm against RadPrompter.
    output_df = pd.read_csv(TEMP_OUT, index_col='index')
    out_df = attach_findings(output_df, kept_rows['Findings'])

    # to_csv() overwrites an existing file, so no explicit delete is needed.
    out_df.to_csv(OUTPUT_FILE)

    print('Examine the output file to assure no reports or other PHI. Please send this file back to BJE@mayo.edu')


if __name__ == '__main__':
    main()


##########################################################################
# This file was converted using nb2py: https://github.com/BardiaKh/nb2py #
##########################################################################