MTBM-Machine-Learning/load_protocol_pdf.py at master · abdinzaghi5601/MTBM-Machine-Learning · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
#!/usr/bin/env python3
"""
Load MTBM Data from Protocol PDF
=================================

This script extracts data from MTBM protocol PDF files.

Your PDF file: C:\\Users\\abdul\\Desktop\\ML for Tunneling\\3000 Measure Protocol.pdf

Usage:
    python load_protocol_pdf.py
"""

import pandas as pd
import numpy as np
import sys
import os

# Try to import PDF reader
# Every PDF backend is optional. A failed import does not abort the script:
# it records an *_AVAILABLE = False flag and prints an install hint at
# import time. The functions below consult these flags before using a backend.

# PyPDF2: used by extract_tables_from_pdf only for the page count and a
# first-page text preview.
try:
    import PyPDF2
    PDF_AVAILABLE = True
except ImportError:
    PDF_AVAILABLE = False
    print("⚠️ PyPDF2 not installed. Install with: pip install PyPDF2")

# pdfplumber: second table-extraction method, tried when tabula found nothing.
try:
    import pdfplumber
    PDFPLUMBER_AVAILABLE = True
except ImportError:
    PDFPLUMBER_AVAILABLE = False
    print("⚠️ pdfplumber not installed. Install with: pip install pdfplumber")

# tabula-py: first table-extraction method.
try:
    import tabula
    TABULA_AVAILABLE = True
except ImportError:
    TABULA_AVAILABLE = False
    print("⚠️ tabula-py not installed. Install with: pip install tabula-py")


def extract_tables_from_pdf(pdf_path):
    """
    Extract tables from PDF using multiple methods

    The methods are tried in order; a later one only runs when the earlier
    ones produced no usable table:

    1. tabula-py (lattice mode, all pages)
    2. pdfplumber (page by page; the first row of each table is used as
       the column header)
    3. PyPDF2 - diagnostic only: prints the page count and a preview of
       the first page's text. It never adds tables.

    Tables without any data rows are discarded so that an "empty" result
    from one method does not block the next method from running.

    Args:
        pdf_path: Path to your PDF file

    Returns:
        List of DataFrames (one per table found), or None when pdf_path is
        not an existing file or no table could be extracted.
    """

    print("=" * 70)
    print("📄 EXTRACTING DATA FROM PDF")
    print("=" * 70)
    print(f"\nFile: {pdf_path}")

    # isfile rather than exists: a directory "exists" but can never be
    # parsed as a PDF and would only produce confusing backend errors.
    if not os.path.isfile(pdf_path):
        print("\n❌ Error: File not found!")
        print(f"Looking for: {pdf_path}")
        print("\nPlease check:")
        print("1. File path is correct")
        print("2. File exists in the specified location")
        return None

    all_tables = []

    # Method 1: Try tabula-py (best for structured tables)
    if TABULA_AVAILABLE:
        print("\n🔧 Method 1: Using tabula-py...")
        try:
            tables = tabula.read_pdf(
                pdf_path,
                pages='all',
                multiple_tables=True,
                lattice=True  # For tables with visible borders
            )

            # Ignore frames with no data; otherwise they would count as
            # "found" and suppress the pdfplumber fallback below.
            tables = [df for df in (tables or []) if not df.empty]

            if tables:
                print(f"✅ Found {len(tables)} tables with tabula-py")
                all_tables.extend(tables)
            else:
                print("⚠️ No tables found with tabula-py")

        except Exception as e:
            # Best effort: any backend failure just moves on to the next method.
            print(f"⚠️ tabula-py error: {e}")

    # Method 2: Try pdfplumber (good for various PDF types)
    if PDFPLUMBER_AVAILABLE and not all_tables:
        print("\n🔧 Method 2: Using pdfplumber...")
        try:
            import pdfplumber

            with pdfplumber.open(pdf_path) as pdf:
                for i, page in enumerate(pdf.pages, 1):
                    print(f"   Scanning page {i}...")

                    tables = page.extract_tables()
                    if not tables:
                        continue

                    print(f"   ✅ Found {len(tables)} tables on page {i}")

                    for table in tables:
                        # Row 0 becomes the header, so at least one more
                        # row is needed for the table to carry any data.
                        if table and len(table) > 1:
                            df = pd.DataFrame(table[1:], columns=table[0])
                            all_tables.append(df)

        except Exception as e:
            print(f"⚠️ pdfplumber error: {e}")

    # Method 3: Extract text (diagnostic preview only, no tables are produced)
    if not all_tables and PDF_AVAILABLE:
        print("\n🔧 Method 3: Extracting text from PDF...")
        print("⚠️ No structured tables found.")
        print("   You may need to manually export data from PDF to Excel/CSV")

        try:
            with open(pdf_path, 'rb') as file:
                pdf_reader = PyPDF2.PdfReader(file)
                page_count = len(pdf_reader.pages)

                print("\nPDF Info:")
                print(f"   Pages: {page_count}")

                # Preview the first page only, and only if there is one.
                if page_count:
                    print("\nFirst page preview:")
                    print("-" * 70)
                    # Guard against a backend returning None for pages
                    # without extractable text.
                    text = pdf_reader.pages[0].extract_text() or ""
                    print(text[:500])  # First 500 characters
                    print("-" * 70)

        except Exception as e:
            print(f"⚠️ Text extraction error: {e}")

    if all_tables:
        print(f"\n✅ Total tables extracted: {len(all_tables)}")
        return all_tables

    print("\n❌ Could not extract tables from PDF")
    print("\n💡 Recommended Solutions:")
    print("1. Convert PDF to Excel manually and use load_real_data.py")
    print("2. Copy-paste data from PDF to Excel")
    print("3. Use PDF software to export tables")
    return None


def load_protocol_pdf_manual():
    """
    Print the manual extraction guide.

    Writes three alternative step-by-step recipes to stdout (PDF-to-Excel
    conversion, copy-paste into Excel, and the free Tabula tool), framed
    by banner rules. Takes no arguments and returns nothing.
    """

    heavy_rule = "=" * 70
    light_rule = "-" * 70

    # One entry per printed line; embedded "\n" characters produce the
    # blank lines between sections.
    guide_lines = (
        "\n" + heavy_rule,
        "📋 MANUAL DATA EXTRACTION GUIDE",
        heavy_rule,

        "\nYour protocol PDF contains measurement data.",
        "Here's how to get it into the ML system:\n",

        "OPTION 1: Convert PDF to Excel (Recommended)",
        light_rule,
        "1. Open your PDF: '3000 Measure Protocol.pdf'",
        "2. Use Adobe Acrobat (if available):",
        "   File → Export To → Spreadsheet → Microsoft Excel",
        "\n3. Or use online converter:",
        "   - https://www.adobe.com/acrobat/online/pdf-to-excel.html",
        "   - https://www.ilovepdf.com/pdf_to_excel",
        "\n4. Save as: 'AVN3000_Data.xlsx'",
        "5. Then run: python load_real_data.py",

        "\n\nOPTION 2: Manual Copy-Paste",
        light_rule,
        "1. Open PDF and select data table",
        "2. Copy the data (Ctrl+C)",
        "3. Open Excel and paste (Ctrl+V)",
        "4. Clean up the data:",
        "   - Remove header rows if needed",
        "   - Ensure columns are properly separated",
        "   - Save as 'AVN3000_Data.xlsx'",
        "5. Run: python load_real_data.py",

        "\n\nOPTION 3: Use Tabula (Free Software)",
        light_rule,
        "1. Download Tabula: https://tabula.technology/",
        "2. Open your PDF in Tabula",
        "3. Select the data table",
        "4. Export as CSV",
        "5. Run: python load_real_data.py",

        "\n" + heavy_rule,
    )

    for guide_line in guide_lines:
        print(guide_line)


def main(pdf_file=None):
    """
    Main function

    Tries to extract tables from the protocol PDF and saves each one as
    'protocol_table_<n>.csv' in the current directory. When nothing can be
    extracted (or no PDF library is installed) it prints the manual
    extraction guide; in the "nothing extracted" case it also writes a
    small template data file for the user to fill in.

    Args:
        pdf_file: Optional path to the PDF. When omitted, the first
            command-line argument is used if one was given, otherwise
            the hard-coded default path below.
    """

    print("\n" + "=" * 70)
    print("🚀 MTBM PROTOCOL PDF LOADER")
    print("=" * 70)

    # ================================================
    # YOUR PDF FILE PATH - EDIT THIS!
    # (or pass the path as the first command-line argument)
    # ================================================

    default_pdf_file = r"C:\Users\abdul\Desktop\ML for Tunneling\3000 Measure Protocol.pdf"

    # Alternative: Use relative path if PDF is in same folder
    # default_pdf_file = "3000 Measure Protocol.pdf"

    # ================================================

    # Precedence: explicit argument > command line > hard-coded default.
    if pdf_file is None:
        pdf_file = sys.argv[1] if len(sys.argv) > 1 else default_pdf_file

    print(f"\nTarget PDF: {pdf_file}")

    # Check if PDF libraries are available
    if not (TABULA_AVAILABLE or PDFPLUMBER_AVAILABLE or PDF_AVAILABLE):
        print("\n❌ No PDF libraries installed!")
        print("\nPlease install at least one:")
        print("   pip install tabula-py")
        print("   pip install pdfplumber")
        print("   pip install PyPDF2")
        print("\nOr use manual extraction (see guide below)")
        load_protocol_pdf_manual()
        return

    # Try to extract tables
    tables = extract_tables_from_pdf(pdf_file)

    if tables:
        print("\n" + "=" * 70)
        print("📊 EXTRACTED TABLES")
        print("=" * 70)

        for i, df in enumerate(tables, 1):
            print(f"\nTable {i}:")
            print(f"   Shape: {df.shape}")
            print(f"   Columns: {list(df.columns)}")
            print("\nFirst few rows:")
            print(df.head())

            # Save table
            output_file = f"protocol_table_{i}.csv"
            df.to_csv(output_file, index=False)
            print(f"\n💾 Saved to: {output_file}")

        print("\n" + "=" * 70)
        print("✅ NEXT STEPS")
        print("=" * 70)
        print("\n1. Review the extracted CSV files")
        print("2. Choose the file with your measurement data")
        print("3. Edit load_real_data.py to use that CSV file:")
        print("   USE_CSV = True")
        print("   csv_file = 'protocol_table_1.csv'")
        print("4. Run: python load_real_data.py")
        return

    # Nothing extracted: show manual guide
    load_protocol_pdf_manual()

    print("\n" + "=" * 70)
    print("📝 QUICK START TEMPLATE")
    print("=" * 70)
    print("\nCreate a CSV/Excel file with these columns:")
    print("-" * 70)

    # Create template
    template = pd.DataFrame({
        'Date': ['2024-01-01', '2024-01-02', '2024-01-03'],
        'Chainage': [10.5, 11.2, 12.8],
        'Ground_Type': ['Clay', 'Sand', 'Clay'],
        'Thrust_kN': [1250, 1450, 1300],
        'Torque_kNm': [210, 245, 220],
        'RPM': [8.5, 8.2, 8.7],
        'Speed_mm_min': [35.2, 28.3, 34.1],
        'Pressure_bar': [130, 145, 135]
    })

    print(template)

    # Save template
    template.to_csv('mtbm_data_template.csv', index=False)
    created_files = ['mtbm_data_template.csv']

    # Writing .xlsx needs an optional Excel engine (e.g. openpyxl); treat
    # its absence like the other optional dependencies instead of crashing
    # after the CSV template has already been written.
    try:
        template.to_excel('mtbm_data_template.xlsx', index=False)
        created_files.append('mtbm_data_template.xlsx')
    except ImportError as e:
        print(f"\n⚠️ Excel template not written: {e}")
        print("   Install with: pip install openpyxl")

    # Only list files that were actually written.
    print("\n💾 Template files created:")
    for created_file in created_files:
        print(f"   - {created_file}")
    print("\nFill this template with your protocol data!")
    print("Then run: python load_real_data.py")


# Run the loader only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()