-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathsnapshot_manager.py
More file actions
executable file
·404 lines (315 loc) · 15.8 KB
/
snapshot_manager.py
File metadata and controls
executable file
·404 lines (315 loc) · 15.8 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
#!/usr/bin/env python3
"""
Snapshot Manager - Local BrightData Snapshot Management Utility
This script provides a comprehensive interface for managing local snapshot records:
- View all snapshot records and their status
- Check current status from BrightData API
- Initiate downloads for ready snapshots
- View downloaded data locally
- Monitor progress of processing snapshots
Usage:
python snapshot_manager.py [options]
Options:
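--list, -l                  List all snapshot records
--status, -s                Check status of all snapshots
--download, -d SNAPSHOT_ID  Download the given snapshot (it must be ready)
--view, -v SNAPSHOT_ID      View downloaded data for the given snapshot
--monitor, -m               Monitor processing snapshots
--storage-dir DIR           Directory holding snapshot records (default: snapshot_records)
--output-dir DIR            Directory for downloaded files (default: data/downloads)
--lines N                   Number of lines to show with --view (default: 20)
--help, -h                  Show this help message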
"""
import os
import json
import sys
import argparse
import requests
from datetime import datetime
from typing import Dict, List, Any, Optional
from pathlib import Path
# Add util to path for imports
sys.path.append(os.path.dirname(os.path.abspath(__file__)))
from util import BrightDataFilter, get_brightdata_api_key
class SnapshotManager:
    """Manage local snapshot records and BrightData API interactions.

    Every snapshot is tracked by a ``<snapshot_id>.json`` record file inside
    ``storage_dir``; API calls are delegated to ``BrightDataFilter``.
    """

    # Statuses that monitor_processing() keeps polling.
    _PROCESSING_STATUSES = ("scheduled", "building", "submitted")

    # Icon printed next to each known status in the records table.
    _STATUS_ICONS = {
        "ready": "✅",
        "failed": "❌",
        "scheduled": "⏳",
        "building": "🔨",
        "submitted": "📤",
    }

    def __init__(self, storage_dir: str = "snapshot_records"):
        """
        Initialize the snapshot manager

        Exits the process with status 1 (via ``sys.exit``) when the storage
        directory does not exist.

        Args:
            storage_dir: Directory containing snapshot records
        """
        self.storage_dir = Path(storage_dir)
        self.api_key = get_brightdata_api_key()

        if not self.storage_dir.exists():
            print(f"❌ Storage directory '{storage_dir}' not found")
            print("💡 Run some filters first to create snapshot records")
            sys.exit(1)

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------

    def _record_path(self, snapshot_id: str) -> Path:
        """Return the path of the local JSON record for *snapshot_id*."""
        return self.storage_dir / f"{snapshot_id}.json"

    @staticmethod
    def _is_check_error(metadata: Dict[str, Any]) -> bool:
        """Return True when *metadata* is the failure dict built by check_status().

        check_status() reports a failed lookup as ``{"error": ...}`` with no
        "status" key. Requiring the missing "status" means metadata that has a
        status plus an "error" detail (check_all_status reads one for failed
        snapshots) is not mistaken for a failed lookup.
        """
        return "error" in metadata and "status" not in metadata

    @staticmethod
    def _shorten(value: Any, limit: int = 18) -> str:
        """Render *value* for a table cell, truncating long text with "...".

        ``None`` is shown as "N/A" so incomplete records cannot crash the table.
        """
        text = "N/A" if value is None else str(value)
        return text[:limit] + "..." if len(text) > limit else text

    @staticmethod
    def _print_head(lines, max_lines: int) -> None:
        """Print at most *max_lines* items of *lines*, then "..." if more remain."""
        for index, line in enumerate(lines):
            if index >= max_lines:
                print("...")
                break
            print(line.rstrip())

    def _request_delivery(self, filter_obj: Any, snapshot_id: str) -> bool:
        """Ask the API to deliver the snapshot after a failed direct download.

        Always returns False: even when delivery is initiated, nothing has
        been downloaded yet and the caller is told to retry later.
        """
        # NOTE(review): the endpoint below is the public httpbin.org test
        # service, so a successful delivery presumably posts the snapshot to a
        # third party. Replace it with an endpoint you control before using
        # this on real data.
        delivery_config = {
            "deliver": {
                "type": "webhook",
                "filename": {
                    "template": f"snapshot_{snapshot_id}",
                    "extension": "json"
                },
                "endpoint": "https://httpbin.org/post"  # Temporary endpoint for testing
            },
            "compress": False
        }
        try:
            delivery_result = filter_obj.deliver_snapshot(snapshot_id, delivery_config)
        except Exception as delivery_error:
            print(f"❌ Failed to initiate delivery: {delivery_error}")
            return False

        print(f"✅ Delivery initiated: {delivery_result}")
        print("⏳ Please wait for delivery to complete, then try downloading again")
        return False

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    def list_records(self) -> List[Dict[str, Any]]:
        """List all local snapshot records

        Returns:
            One summary dict per readable ``*.json`` file in the storage
            directory, newest submission first. Unreadable files are reported
            on stdout and skipped.
        """
        records = []

        for json_file in self.storage_dir.glob("*.json"):
            try:
                with open(json_file, 'r', encoding='utf-8') as f:
                    record = json.load(f)

                metadata = record.get("metadata") or {}
                records.append({
                    "snapshot_id": record.get("snapshot_id"),
                    "submission_time": record.get("submission_time"),
                    "status": record.get("status"),
                    "dataset_id": record.get("dataset_id"),
                    "records_limit": record.get("records_limit"),
                    "completion_time": record.get("completion_time"),
                    "cost": metadata.get("cost", "N/A"),
                    "file_path": str(json_file)
                })
            except Exception as e:
                # Best effort: one bad file must not hide the other records.
                print(f"⚠️ Error reading {json_file}: {e}")

        # Sort by submission time (newest first). The key is always present
        # but may be None, so normalise to a string to keep values comparable.
        records.sort(key=lambda x: str(x.get("submission_time") or ""), reverse=True)
        return records

    def print_records_table(self, records: List[Dict[str, Any]]):
        """Print records in a formatted table

        Args:
            records: Summary dicts as produced by list_records(). Missing or
                ``None`` fields are shown as "N/A" instead of raising.
        """
        if not records:
            print("📭 No snapshot records found")
            return

        print(f"\n📊 Found {len(records)} snapshot records:")
        print("=" * 120)
        print(f"{'Snapshot ID':<20} {'Status':<12} {'Dataset':<20} {'Limit':<8} {'Cost':<8} {'Submitted':<20}")
        print("=" * 120)

        for record in records:
            snapshot_id = self._shorten(record.get("snapshot_id"))
            raw_status = record.get("status")
            status = "N/A" if raw_status is None else str(raw_status)
            dataset_id = self._shorten(record.get("dataset_id"))
            limit = str(record.get("records_limit"))
            raw_cost = record.get("cost", "N/A")
            cost = "N/A" if raw_cost in (None, "N/A") else f"${raw_cost}"
            raw_submitted = record.get("submission_time")
            # Keep only "YYYY-MM-DDTHH:MM:SS" from an ISO timestamp.
            submitted = str(raw_submitted)[:19] if raw_submitted else "N/A"

            status_icon = self._STATUS_ICONS.get(raw_status, "❓")

            print(f"{snapshot_id:<20} {status_icon} {status:<10} {dataset_id:<20} {limit:<8} {cost:<8} {submitted:<20}")

        print("=" * 120)

    def check_status(self, snapshot_id: str) -> Dict[str, Any]:
        """Check current status of a snapshot from BrightData API

        Also passes the fetched metadata to
        ``BrightDataFilter.update_snapshot_record`` for the local record.

        Args:
            snapshot_id: Snapshot ID to look up

        Returns:
            The metadata returned by the API, or ``{"error": <message>}`` when
            anything goes wrong (this method never raises).
        """
        try:
            # Get dataset ID from local record
            record_path = self._record_path(snapshot_id)
            if not record_path.exists():
                raise FileNotFoundError(f"No local record for {snapshot_id}")

            with open(record_path, 'r', encoding='utf-8') as f:
                record = json.load(f)
            dataset_id = record["dataset_id"]

            # Initialize filter for API calls
            filter_obj = BrightDataFilter(dataset_id, str(self.storage_dir), self.api_key)

            # Get current metadata and update the local record
            metadata = filter_obj.get_snapshot_metadata(snapshot_id)
            filter_obj.update_snapshot_record(snapshot_id, metadata=metadata)

            return metadata

        except Exception as e:
            # Deliberate catch-all: callers expect an error dict, not an exception.
            return {"error": str(e)}

    def check_all_status(self):
        """Check status of all snapshots and print a short report for each"""
        records = self.list_records()

        print(f"\n🔍 Checking status of {len(records)} snapshots...")

        for i, record in enumerate(records, 1):
            snapshot_id = record["snapshot_id"]
            print(f"\n[{i}/{len(records)}] Checking {snapshot_id}...")

            metadata = self.check_status(snapshot_id)

            if self._is_check_error(metadata):
                print(f"❌ Error: {metadata['error']}")
                continue

            status = metadata.get("status", "unknown")
            cost = metadata.get("cost", "N/A")
            dataset_size = metadata.get("dataset_size", "N/A")
            file_size = metadata.get("file_size", "N/A")

            print(f"📊 Status: {status}")
            print(f"💰 Cost: ${cost}")
            print(f"📈 Records: {dataset_size}")
            print(f"💾 Size: {file_size} bytes")

            if status == "ready":
                print("✅ Ready for download!")
            elif status == "failed":
                print(f"❌ Failed: {metadata.get('error', 'Unknown error')}")
            elif status in ["scheduled", "building"]:
                print("⏳ Still processing...")

    def download_snapshot(self, snapshot_id: str, output_dir: str = "data/downloads") -> bool:
        """
        Download a ready snapshot

        Tries a direct download first; if that fails, asks the API to deliver
        the snapshot instead (which still counts as "not downloaded").

        Args:
            snapshot_id: Snapshot ID to download
            output_dir: Directory to save downloaded files (created, including
                missing parent directories, if necessary)

        Returns:
            True if download successful, False otherwise
        """
        try:
            # Check if snapshot is ready
            metadata = self.check_status(snapshot_id)

            if self._is_check_error(metadata):
                print(f"❌ Error checking status: {metadata['error']}")
                return False

            if metadata.get("status") != "ready":
                print(f"❌ Snapshot not ready. Status: {metadata.get('status')}")
                return False

            # Try direct download first using the Snapshot Content API
            print("📥 Attempting direct download using Snapshot Content API...")

            # Get dataset_id from local record
            record_path = self._record_path(snapshot_id)
            if not record_path.exists():
                print(f"❌ No local record found for {snapshot_id}")
                return False

            with open(record_path, 'r', encoding='utf-8') as f:
                record = json.load(f)

            # Built before the inner try so the delivery fallback can always
            # use it; a constructor failure is reported by the outer handler.
            filter_obj = BrightDataFilter(record["dataset_id"], str(self.storage_dir), self.api_key)

            try:
                response = filter_obj.download_snapshot_content(snapshot_id, format="json")

                # parents=True: the default "data/downloads" is nested and its
                # parent may not exist yet.
                output_path = Path(output_dir)
                output_path.mkdir(parents=True, exist_ok=True)

                # Stream the body to disk in chunks
                file_path = output_path / f"{snapshot_id}.json"
                with open(file_path, 'wb') as f:
                    for chunk in response.iter_content(chunk_size=8192):
                        f.write(chunk)

                print(f"✅ Downloaded to: {file_path}")

                # Update local record with download info
                record["downloaded_file"] = str(file_path)
                record["download_time"] = datetime.now().isoformat()
                with open(record_path, 'w') as f:
                    json.dump(record, f, indent=2)

                return True

            except Exception as e:
                print(f"❌ Direct download failed: {e}")
                print("📤 Falling back to deliver snapshot method...")
                return self._request_delivery(filter_obj, snapshot_id)

        except Exception as e:
            print(f"❌ Download failed: {e}")
            return False

    def view_downloaded_data(self, snapshot_id: str, max_lines: int = 20):
        """View downloaded data for a snapshot

        Args:
            snapshot_id: Snapshot whose local record names the downloaded file
            max_lines: Maximum number of lines to print; for ``.json`` files
                this counts lines of the pretty-printed (indent=2) document
        """
        record_path = self._record_path(snapshot_id)
        if not record_path.exists():
            print(f"❌ No record found for {snapshot_id}")
            return

        with open(record_path, 'r', encoding='utf-8') as f:
            record = json.load(f)

        downloaded_file = record.get("downloaded_file")
        if not downloaded_file:
            print(f"❌ No downloaded file for {snapshot_id}")
            return

        file_path = Path(downloaded_file)
        if not file_path.exists():
            print(f"❌ Downloaded file not found: {file_path}")
            return

        print(f"📄 Viewing {file_path} (first {max_lines} lines):")
        print("=" * 80)

        try:
            if file_path.suffix == '.json':
                with open(file_path, 'r', encoding='utf-8') as f:
                    data = json.load(f)
                # Pretty-print, then apply the same line limit as plain text
                # so the header above is accurate for JSON as well.
                self._print_head(json.dumps(data, indent=2).splitlines(), max_lines)
            else:
                with open(file_path, 'r', encoding='utf-8') as f:
                    self._print_head(f, max_lines)
        except Exception as e:
            print(f"❌ Error reading file: {e}")

    def monitor_processing(self, check_interval: int = 30):
        """Monitor all processing snapshots

        Polls every snapshot whose local status is scheduled, building or
        submitted until each one reports ready or failed, or until the user
        presses Ctrl+C.

        Args:
            check_interval: Seconds to wait between polling rounds
        """
        import time  # only this method needs it

        pending = [r for r in self.list_records() if r["status"] in self._PROCESSING_STATUSES]

        if not pending:
            print("📭 No snapshots currently processing")
            return

        print(f"🔍 Monitoring {len(pending)} processing snapshots...")
        print("Press Ctrl+C to stop monitoring")

        try:
            while pending:
                print(f"\n⏰ {datetime.now().strftime('%H:%M:%S')} - Checking status...")

                still_pending = []
                for record in pending:
                    snapshot_id = record["snapshot_id"]
                    metadata = self.check_status(snapshot_id)

                    if self._is_check_error(metadata):
                        # Report the problem and retry on the next round.
                        print(f"  {snapshot_id}: ❌ Error: {metadata['error']}")
                        still_pending.append(record)
                        continue

                    status = metadata.get("status", "unknown")
                    print(f"  {snapshot_id}: {status}")

                    if status == "ready":
                        print(f"  ✅ {snapshot_id} is ready for download!")
                    elif status == "failed":
                        print(f"  ❌ {snapshot_id} failed")
                    else:
                        still_pending.append(record)

                # Finished snapshots are dropped so they are not polled again.
                pending = still_pending
                if not pending:
                    print("✅ All monitored snapshots have finished")
                    break

                print(f"⏳ Waiting {check_interval} seconds...")
                time.sleep(check_interval)

        except KeyboardInterrupt:
            print("\n🛑 Monitoring stopped")
def main(argv: Optional[List[str]] = None) -> int:
    """Main CLI interface

    Args:
        argv: Argument list to parse. ``None`` (the default) makes argparse
            read the process arguments (``sys.argv[1:]``).

    Returns:
        Process exit status: 0 on success, non-zero when the manager could
        not be initialized or a requested download failed.
    """
    parser = argparse.ArgumentParser(description="BrightData Snapshot Manager")
    parser.add_argument("--list", "-l", action="store_true", help="List all snapshot records")
    parser.add_argument("--status", "-s", action="store_true", help="Check status of all snapshots")
    parser.add_argument("--download", "-d", type=str, help="Download specific snapshot by ID")
    parser.add_argument("--view", "-v", type=str, help="View downloaded data for snapshot ID")
    parser.add_argument("--monitor", "-m", action="store_true", help="Monitor processing snapshots")
    parser.add_argument("--storage-dir", type=str, default="snapshot_records", help="Storage directory for records")
    parser.add_argument("--output-dir", type=str, default="data/downloads", help="Output directory for downloads")
    parser.add_argument("--lines", type=int, default=20, help="Number of lines to show when viewing data")

    args = parser.parse_args(argv)

    # Initialize manager
    try:
        manager = SnapshotManager(args.storage_dir)
    except SystemExit as exc:
        # The constructor signals a missing storage directory via sys.exit();
        # hand its code back instead of swallowing it, so the shell sees the
        # failure.
        return exc.code if isinstance(exc.code, int) else 1

    exit_code = 0

    # Execute requested actions
    if args.list:
        records = manager.list_records()
        manager.print_records_table(records)

    if args.status:
        manager.check_all_status()

    if args.download:
        print(f"📥 Downloading snapshot: {args.download}")
        success = manager.download_snapshot(args.download, args.output_dir)
        if success:
            print(f"✅ Successfully downloaded {args.download}")
        else:
            print(f"❌ Failed to download {args.download}")
            exit_code = 1

    if args.view:
        manager.view_downloaded_data(args.view, args.lines)

    if args.monitor:
        manager.monitor_processing()

    # If no arguments provided, show help
    if not any([args.list, args.status, args.download, args.view, args.monitor]):
        parser.print_help()
        print("\n💡 Quick start:")
        print(" python snapshot_manager.py --list # List all snapshots")
        print(" python snapshot_manager.py --status # Check all statuses")
        print(" python snapshot_manager.py --download <snapshot_id> # Download specific snapshot")

    return exit_code


if __name__ == "__main__":
    sys.exit(main())