-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdata_loader.py
More file actions
165 lines (141 loc) · 5.99 KB
/
data_loader.py
File metadata and controls
165 lines (141 loc) · 5.99 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
import pandas as pd
import numpy as np
from typing import Optional
from data_sources import YFinanceSource, HistoricalDataSource
# Expected CSV columns: timestamp, bid, ask, bid_size, ask_size, last, last_size, symbol
# timestamp should be parseable to pandas datetime (UTC preferred)
def get_real_market_data(symbol: str = "AAPL", period: str = "5d") -> pd.DataFrame:
    """
    Fetch market data for ``symbol``, preferring Yahoo Finance.

    The Yahoo Finance source is tried first and its result is used only when
    the source's own ``validate_data`` check accepts it. If that check rejects
    the data, or if anything on the Yahoo Finance path raises, the function
    falls back to ``HistoricalDataSource`` (requested with
    ``dataset="equity"``).

    Args:
        symbol: Ticker passed to both data sources.
        period: Look-back window forwarded to the Yahoo Finance source only.

    Returns:
        The frame produced by ``normalize_data`` for whichever source was used.
    """
    print(f"🔄 Fetching real market data for {symbol}...")

    # Primary path. Normalization stays inside the ``try`` on purpose: a frame
    # that passes validation but cannot be normalized also triggers the
    # fallback instead of propagating the error.
    try:
        primary_source = YFinanceSource()
        data = primary_source.fetch_data(symbol, period=period)
        accepted = primary_source.validate_data(data)
        if accepted:
            print(f"✅ Successfully loaded {len(data)} ticks from Yahoo Finance")
            return normalize_data(data)
    except Exception as e:
        # Best-effort boundary: report the failure and continue to the fallback.
        print(f"⚠️ Yahoo Finance failed: {e}")

    # Fallback path: reached on an exception above or on rejected data.
    print("📊 Using realistic sample data based on market patterns...")
    data = HistoricalDataSource().fetch_data(symbol=symbol, dataset="equity")
    print(f"✅ Generated {len(data)} ticks of realistic sample data")
    return normalize_data(data)
def normalize_data(df: pd.DataFrame) -> pd.DataFrame:
    """Normalize data format for consistent processing.

    Operates on a copy; the caller's frame is not modified.

    Steps:
      * If there is a ``timestamp`` column it is parsed to datetime and, unless
        the index is already named ``timestamp``, becomes the index.
      * If instead the index is already named ``timestamp`` but is not a
        ``DatetimeIndex`` (e.g. ISO strings), the index itself is parsed so
        the result is still time-indexed.
      * ``mid`` is added as the bid/ask average when absent.
      * ``symbol`` is added as ``'UNKNOWN'`` when absent.
      * Rows are sorted by the index.

    Args:
        df: Frame containing at least ``bid``, ``ask``, ``bid_size`` and
            ``ask_size`` columns.

    Returns:
        A new, index-sorted frame with ``mid`` and ``symbol`` columns present.

    Raises:
        ValueError: If any of the required quote columns is missing.
    """
    df = df.copy()

    # Ensure timestamp is datetime and set as index
    if 'timestamp' in df.columns:
        df['timestamp'] = pd.to_datetime(df['timestamp'])
        if df.index.name != 'timestamp':
            df = df.set_index('timestamp')
    elif df.index.name == 'timestamp' and not isinstance(df.index, pd.DatetimeIndex):
        # The timestamp is already the index but was never parsed; without
        # this, the sort below would order it as plain strings/objects.
        df.index = pd.to_datetime(df.index)
        df.index.name = 'timestamp'

    # Ensure required columns exist
    required_cols = ['bid', 'ask', 'bid_size', 'ask_size']
    for col in required_cols:
        if col not in df.columns:
            raise ValueError(f"Missing required column: {col}")

    # Add mid price if not present
    if 'mid' not in df.columns:
        df['mid'] = (df['bid'] + df['ask']) / 2.0

    # Add symbol if missing
    if 'symbol' not in df.columns:
        df['symbol'] = 'UNKNOWN'

    # Sort by timestamp
    return df.sort_index()
def load_data(csv_path: Optional[str] = None, symbol: Optional[str] = None) -> pd.DataFrame:
    """
    Load historical tick/order book data from CSV.
    Maintains backward compatibility with existing CSV loading.

    Column names are lower-cased. Rows whose ``timestamp`` cannot be parsed
    are dropped, and the remaining rows are sorted by time and indexed by the
    UTC ``timestamp``. Missing quote/trade columns are created (prices as NaN,
    sizes as 0.0), bid/ask prices and sizes are forward-filled, and ``mid`` is
    computed as the bid/ask average.

    Args:
        csv_path: Path of the CSV file to read. Required despite the default.
        symbol: If given, overwrites the ``symbol`` column for every row.
            If omitted and the CSV has no ``symbol`` column, ``'TICK'`` is used.

    Returns:
        A frame indexed by ``timestamp`` with at least the columns
        ``bid, ask, bid_size, ask_size, last, last_size, symbol, mid``.

    Raises:
        ValueError: If ``csv_path`` is empty or the CSV has no ``timestamp``
            column.
    """
    if not csv_path:
        raise ValueError("CSV path is required for load_data function")

    df = pd.read_csv(csv_path)

    # normalize columns
    df.columns = [c.lower() for c in df.columns]
    if 'timestamp' not in df.columns:
        raise ValueError("CSV must contain a 'timestamp' column")

    # Unparseable timestamps become NaT and are discarded with their rows.
    df['timestamp'] = pd.to_datetime(df['timestamp'], utc=True, errors='coerce')
    df = df.dropna(subset=['timestamp']).sort_values('timestamp')

    # Ensure required cols exist. The symbol placeholder is 'TICK' rather than
    # NaN: back-filling it with NaN would make the column "present" and leave
    # NaN symbols in the output whenever no explicit symbol is supplied.
    column_defaults = {
        'bid': np.nan,
        'ask': np.nan,
        'bid_size': 0.0,
        'ask_size': 0.0,
        'last': np.nan,
        'last_size': 0.0,
        'symbol': 'TICK',
    }
    for col, default in column_defaults.items():
        if col not in df.columns:
            df[col] = default

    # Forward-fill bid/ask for missing ticks
    quote_cols = ['bid', 'ask', 'bid_size', 'ask_size']
    df[quote_cols] = df[quote_cols].ffill()

    # compute mid
    df['mid'] = (df['bid'] + df['ask']) / 2.0
    df = df.set_index('timestamp')

    # An explicit symbol always wins over whatever the CSV contained.
    if symbol:
        df['symbol'] = symbol
    return df
def make_synthetic_orderbook(start: str = '2024-01-02 09:30:00', periods: int = 1200, freq: str = 'S',
                             start_price: float = 100.0, symbol: str = 'SYN',
                             seed: Optional[int] = 42) -> pd.DataFrame:
    """
    Generate a synthetic order book time series with bid/ask, last trade and sizes.

    The mid price is a Gaussian random walk starting near ``start_price``;
    bid/ask sit symmetrically around it with a spread clipped to
    [0.005, 0.05]; the last trade price is the mid plus small Gaussian noise.

    Args:
        start: First timestamp of the series (interpreted as UTC).
        periods: number of ticks
        freq: pandas offset alias (e.g., 'S' or 's' for seconds). The legacy
            single-letter aliases 'S', 'T', 'H', 'L', 'U' and 'N' are mapped
            to their current spellings so the call does not depend on aliases
            that newer pandas releases deprecate. Only an exact alias is
            mapped; multiples such as '5S' are passed through unchanged.
        start_price: Level the random walk is offset from.
        symbol: Value written to the ``symbol`` column of every row.
        seed: Seed for the random generator. The default of 42 reproduces the
            historical fixed output; pass ``None`` for a fresh random series.

    Returns:
        A frame indexed by a UTC ``timestamp`` index with columns
        ``symbol, bid, ask, bid_size, ask_size, last, last_size, mid``
        (sizes stored as floats).
    """
    # Translate deprecated aliases before handing the frequency to pandas.
    legacy_aliases = {'S': 's', 'T': 'min', 'H': 'h', 'L': 'ms', 'U': 'us', 'N': 'ns'}
    freq = legacy_aliases.get(freq, freq)

    idx = pd.date_range(start=start, periods=periods, freq=freq, tz='UTC')
    rng = np.random.default_rng(seed)

    # NOTE: the order of the draws below is part of the output contract for a
    # given seed; reordering them changes every generated value.

    # price as random walk
    steps = rng.normal(0, 0.01, size=periods).cumsum()
    mid = start_price + steps

    # Clip keeps the spread strictly positive so ask is always above bid.
    spread = np.clip(rng.normal(0.01, 0.005, size=periods), 0.005, 0.05)
    bid = mid - spread / 2
    ask = mid + spread / 2
    bid_size = rng.integers(100, 1000, size=periods)
    ask_size = rng.integers(100, 1000, size=periods)

    # last trade oscillates around mid
    last = mid + rng.normal(0, 0.005, size=periods)
    last_size = rng.integers(50, 500, size=periods)

    df = pd.DataFrame({
        'symbol': symbol,
        'bid': bid,
        'ask': ask,
        'bid_size': bid_size.astype(float),
        'ask_size': ask_size.astype(float),
        'last': last,
        'last_size': last_size.astype(float),
        'mid': mid,
    }, index=idx)
    df.index.name = 'timestamp'
    return df