RoboAdvisingWithGoalBasedPortfolioOptimization/data_processor.py at master · elroyhaw/RoboAdvisingWithGoalBasedPortfolioOptimization · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
import pandas as pd
import numpy as np
import pandas_datareader.data as web
from datetime import datetime


def load_china_p2p_data():
    """
    Load and process P2P lending data

    """

    df = pd.read_csv("data/chinaP2PData.csv")
    df.drop(index=0, inplace=True)
    df.reset_index(drop=True, inplace=True)
    df = df[["funded_amnt", "emp_length", "home_ownership", "annual_inc", "purpose", "dti", "int_rate"]]
    df.dropna(inplace=True)
    df = df[df.purpose != "other"]
    df.funded_amnt = df.funded_amnt.astype(np.int64)
    df.annual_inc = df.annual_inc.astype(np.float64)
    df.dti = df.dti.astype(np.float64)
    df.int_rate = df.int_rate.apply(lambda x: np.float64(x.replace("%", "")))
    int_rate_map = {int_rate: i for i, int_rate in enumerate(sorted(df.int_rate.unique()))}
    df.int_rate = df.int_rate.apply(lambda x: int_rate_map[x])

    def format_emp_length(emp_length):
        if "+" in emp_length:
            emp_length = emp_length.replace("+ years", "")
        elif "<" in emp_length:
            emp_length = emp_length.replace("< ", "").replace(" year", "")
        else:
            if "1" in emp_length:
                emp_length = emp_length.replace(" year", "")
            else:
                emp_length = emp_length.replace(" years", "")
        return np.int64(emp_length)

    def format_purpose(purpose):
        purposes = ['debt_consolidation', 'home_improvement', 'credit_card', 'vacation',
                    'car', 'medical', 'major_purchase', 'house', 'small_business', 'moving',
                    'renewable_energy']
        purpose_map = dict([(purpose, i) for i, purpose in enumerate(purposes)])
        return purpose_map[purpose]

    df.emp_length = df.emp_length.apply(format_emp_length)
    df.funded_amnt /= 1000
    df.annual_inc /= 1000
    df = df[df.dti >= 0]
    df.purpose = df.purpose.apply(format_purpose)
    # drop anomalies
    df.drop(index=[18534, 17181, 6213], inplace=True)

    return df


def load_imdb_data():
    """
    Load and process IMDb data

    """

    df = pd.read_csv("data/movie_metadata.csv")
    df = df[(df["title_year"] >= 2011) & (df["title_year"] <= 2013)]
    df = df[["gross", "imdb_score", "content_rating"]]
    df = df[df["gross"] >= 6000000.0]
    df = df[(df["content_rating"] == "PG-13") | (df["content_rating"] == "PG")]
    df["content_rating"] = df["content_rating"].apply(lambda x: 0 if x == "PG-13" else 1)
    df["gross"] /= 10 ** 6
    df.dropna(inplace=True)

    return df


def load_assets(tickers, start_date=None):
    """
    Get returns and covariance matrix of selected tickers

    :param tickers: List of tickers
    :param start_date: Start date
    :type tickers: list
    :type start_date: datetime
    :return: Tuple of returns, covariance
    :rtype: tuple
    """


    data = []
    for ticker in tickers:
        if start_date:
            adj_close = web.DataReader(ticker,
                                       data_source="yahoo",
                                       start=start_date,
                                       end=datetime.now())['Adj Close']
        else:
            adj_close = web.DataReader(ticker,
                                       data_source="yahoo",
                                       start=datetime(2019, 1, 1),
                                       end=datetime.now())['Adj Close']
        adj_close = adj_close[~adj_close.index.duplicated(keep='first')]
        adj_close.name = ticker
        data.append(adj_close)

    all_adj_close = pd.concat(data, axis=1)
    all_adj_close.dropna(inplace=True)
    returns = all_adj_close.diff() / all_adj_close.shift(1)
    returns.dropna(inplace=True)

    mu = returns.iloc[-1]
    sigma = np.cov(returns, rowvar=False)

    return mu, sigma