Commit de6e775
Parent(s): c31337d
Upload 19 files
- finnlp/data_processors/__pycache__/__init__.cpython-310.pyc +0 -0
- finnlp/data_processors/__pycache__/_base.cpython-310.pyc +0 -0
- finnlp/data_processors/__pycache__/yahoofinance.cpython-310.pyc +0 -0
- finnlp/data_processors/_base.py +637 -0
- finnlp/data_processors/akshare.py +148 -0
- finnlp/data_processors/alpaca.py +441 -0
- finnlp/data_processors/alphavantage.py +92 -0
- finnlp/data_processors/baostock.py +114 -0
- finnlp/data_processors/binance.py +434 -0
- finnlp/data_processors/ccxt.py +143 -0
- finnlp/data_processors/fx.py +102 -0
- finnlp/data_processors/iexcloud.py +143 -0
- finnlp/data_processors/joinquant.py +68 -0
- finnlp/data_processors/quandl.py +85 -0
- finnlp/data_processors/quantconnect.py +70 -0
- finnlp/data_processors/ricequant.py +131 -0
- finnlp/data_processors/tushare.py +318 -0
- finnlp/data_processors/wrds.py +330 -0
- finnlp/data_processors/yahoofinance.py +190 -0
finnlp/data_processors/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (172 Bytes).
finnlp/data_processors/__pycache__/_base.cpython-310.pyc
ADDED
Binary file (12.6 kB).
finnlp/data_processors/__pycache__/yahoofinance.cpython-310.pyc
ADDED
Binary file (4.66 kB).
finnlp/data_processors/_base.py
ADDED
@@ -0,0 +1,637 @@
import copy
import os
import urllib
import zipfile
from datetime import *
from pathlib import Path
from typing import List

import numpy as np
import pandas as pd
import stockstats
import talib

from finnlp.utils.config import BINANCE_BASE_URL
from finnlp.utils.config import TIME_ZONE_BERLIN
from finnlp.utils.config import TIME_ZONE_JAKARTA
from finnlp.utils.config import TIME_ZONE_PARIS
from finnlp.utils.config import TIME_ZONE_SELFDEFINED
from finnlp.utils.config import TIME_ZONE_SHANGHAI
from finnlp.utils.config import TIME_ZONE_USEASTERN
from finnlp.utils.config import USE_TIME_ZONE_SELFDEFINED
from finnlp.utils.config_tickers import CAC_40_TICKER
from finnlp.utils.config_tickers import CSI_300_TICKER
from finnlp.utils.config_tickers import DAX_30_TICKER
from finnlp.utils.config_tickers import DOW_30_TICKER
from finnlp.utils.config_tickers import HSI_50_TICKER
from finnlp.utils.config_tickers import LQ45_TICKER
from finnlp.utils.config_tickers import MDAX_50_TICKER
from finnlp.utils.config_tickers import NAS_100_TICKER
from finnlp.utils.config_tickers import SDAX_50_TICKER
from finnlp.utils.config_tickers import SP_500_TICKER
from finnlp.utils.config_tickers import SSE_50_TICKER
from finnlp.utils.config_tickers import TECDAX_TICKER


class _Base:
    def __init__(
        self,
        data_source: str,
        start_date: str,
        end_date: str,
        time_interval: str,
        **kwargs,
    ):
        self.data_source: str = data_source
        self.start_date: str = start_date
        self.end_date: str = end_date
        self.time_interval: str = time_interval  # standard time_interval
        # transferred_time_interval will be supported in the future.
        # self.nonstandard_time_interval: str = self.calc_nonstandard_time_interval()  # transferred time_interval of this processor
        self.time_zone: str = ""
        self.dataframe: pd.DataFrame = pd.DataFrame()
        self.dictnumpy: dict = (
            {}
        )  # e.g., self.dictnumpy["open"] = np.array([1, 2, 3]), self.dictnumpy["close"] = np.array([1, 2, 3])

    def download_data(self, ticker_list: List[str]):
        pass

    def clean_data(self):
        if "date" in self.dataframe.columns.values.tolist():
            self.dataframe.rename(columns={"date": "time"}, inplace=True)
        if "datetime" in self.dataframe.columns.values.tolist():
            self.dataframe.rename(columns={"datetime": "time"}, inplace=True)
        if self.data_source == "ccxt":
            self.dataframe.rename(columns={"index": "time"}, inplace=True)

        if self.data_source == "ricequant":
            # RiceQuant data is already cleaned, we only need to transform
            # the data format here. No need for filling NaN data.
            self.dataframe.rename(columns={"order_book_id": "tic"}, inplace=True)
            # raw df uses a multi-index (tic, time); reset it to a single index (time)
            self.dataframe.reset_index(level=[0, 1], inplace=True)
            # check that there are no NaN values
            assert not self.dataframe.isnull().values.any()
        elif self.data_source == "baostock":
            self.dataframe.rename(columns={"code": "tic"}, inplace=True)

        self.dataframe.dropna(inplace=True)
        # adjusted_close: adjusted close price
        if "adjusted_close" not in self.dataframe.columns.values.tolist():
            self.dataframe["adjusted_close"] = self.dataframe["close"]
        self.dataframe.sort_values(by=["time", "tic"], inplace=True)
        self.dataframe = self.dataframe[
            ["tic", "time", "open", "high", "low", "close", "adjusted_close", "volume"]
        ]

    def fillna(self):
        df = self.dataframe

        dfcode = pd.DataFrame(columns=["tic"])
        dfdate = pd.DataFrame(columns=["time"])

        dfcode.tic = df.tic.unique()
        dfdate.time = df.time.unique()
        dfdate.sort_values(by="time", ascending=False, ignore_index=True, inplace=True)

        # old pandas versions may not support pd.merge(how="cross")
        try:
            df1 = pd.merge(dfcode, dfdate, how="cross")
        except Exception:
            print("Please wait for a few seconds...")
            df1 = pd.DataFrame(columns=["tic", "time"])
            for i in range(dfcode.shape[0]):
                for j in range(dfdate.shape[0]):
                    df1 = df1.append(
                        pd.DataFrame(
                            data={
                                "tic": dfcode.iat[i, 0],
                                "time": dfdate.iat[j, 0],
                            },
                            index=[(i + 1) * (j + 1) - 1],
                        )
                    )

        df = pd.merge(df1, df, how="left", on=["tic", "time"])

        # back fill missing data, then forward fill
        df_new = pd.DataFrame(columns=df.columns)
        for i in df.tic.unique():
            df_tmp = df[df.tic == i].fillna(method="bfill").fillna(method="ffill")
            df_new = pd.concat([df_new, df_tmp], ignore_index=True)

        df_new = df_new.fillna(0)

        # reshape dataframe
        df_new = df_new.sort_values(by=["time", "tic"]).reset_index(drop=True)

        print("Shape of DataFrame: ", df_new.shape)

        self.dataframe = df_new

    def get_trading_days(self, start: str, end: str) -> List[str]:
        if self.data_source in [
            "binance",
            "ccxt",
            "quantconnect",
            "ricequant",
            "tushare",
        ]:
            print(
                f"Calculate get_trading_days not supported for {self.data_source} yet."
            )
            return None

    # select_stockstats_talib: 0 (stockstats, default), or 1 (use talib). Users can choose the method.
    # drop_na_timesteps: 0 (keep timesteps that contain NaN), or 1 (drop timesteps that contain NaN, default). Users can choose the method.
    def add_technical_indicator(
        self,
        tech_indicator_list: List[str],
        select_stockstats_talib: int = 0,
        drop_na_timesteps: int = 1,
    ):
        """
        calculate technical indicators
        use the stockstats/talib package to add technical indicators
        :param data: (df) pandas dataframe
        :return: (df) pandas dataframe
        """
        if "date" in self.dataframe.columns.values.tolist():
            self.dataframe.rename(columns={"date": "time"}, inplace=True)

        if self.data_source == "ccxt":
            self.dataframe.rename(columns={"index": "time"}, inplace=True)

        self.dataframe.reset_index(drop=False, inplace=True)
        if "level_1" in self.dataframe.columns:
            self.dataframe.drop(columns=["level_1"], inplace=True)
        if "level_0" in self.dataframe.columns and "tic" not in self.dataframe.columns:
            self.dataframe.rename(columns={"level_0": "tic"}, inplace=True)
        assert select_stockstats_talib in {0, 1}
        print("tech_indicator_list: ", tech_indicator_list)
        if select_stockstats_talib == 0:  # use stockstats
            stock = stockstats.StockDataFrame.retype(self.dataframe)
            unique_ticker = stock.tic.unique()
            for indicator in tech_indicator_list:
                print("indicator: ", indicator)
                indicator_df = pd.DataFrame()
                for i in range(len(unique_ticker)):
                    try:
                        temp_indicator = stock[stock.tic == unique_ticker[i]][indicator]
                        temp_indicator = pd.DataFrame(temp_indicator)
                        temp_indicator["tic"] = unique_ticker[i]
                        temp_indicator["time"] = self.dataframe[
                            self.dataframe.tic == unique_ticker[i]
                        ]["time"].to_list()
                        indicator_df = pd.concat(
                            [indicator_df, temp_indicator],
                            axis=0,
                            join="outer",
                            ignore_index=True,
                        )
                    except Exception as e:
                        print(e)
                if not indicator_df.empty:
                    self.dataframe = self.dataframe.merge(
                        indicator_df[["tic", "time", indicator]],
                        on=["tic", "time"],
                        how="left",
                    )
        else:  # use talib
            final_df = pd.DataFrame()
            for i in self.dataframe.tic.unique():
                tic_df = self.dataframe[self.dataframe.tic == i]
                # assign new columns (not .loc[row] labels) with the talib outputs
                (
                    tic_df["macd"],
                    tic_df["macd_signal"],
                    tic_df["macd_hist"],
                ) = talib.MACD(
                    tic_df["close"],
                    fastperiod=12,
                    slowperiod=26,
                    signalperiod=9,
                )
                tic_df["rsi"] = talib.RSI(tic_df["close"], timeperiod=14)
                tic_df["cci"] = talib.CCI(
                    tic_df["high"],
                    tic_df["low"],
                    tic_df["close"],
                    timeperiod=14,
                )
                tic_df["dx"] = talib.DX(
                    tic_df["high"],
                    tic_df["low"],
                    tic_df["close"],
                    timeperiod=14,
                )
                final_df = pd.concat([final_df, tic_df], axis=0, join="outer")
            self.dataframe = final_df

        self.dataframe.sort_values(by=["time", "tic"], inplace=True)
        if drop_na_timesteps:
            time_to_drop = self.dataframe[
                self.dataframe.isna().any(axis=1)
            ].time.unique()
            self.dataframe = self.dataframe[~self.dataframe.time.isin(time_to_drop)]
        print("Successfully added technical indicators")

    def add_turbulence(self):
        """
        add turbulence index from a precalculated dataframe
        :param data: (df) pandas dataframe
        :return: (df) pandas dataframe
        """
        if self.data_source in [
            "binance",
            "ccxt",
            "iexcloud",
            "joinquant",
            "quantconnect",
        ]:
            print(
                f"Turbulence not supported for {self.data_source} yet. Return original DataFrame."
            )
        if self.data_source in [
            "alpaca",
            "ricequant",
            "tushare",
            "wrds",
            "yahoofinance",
        ]:
            turbulence_index = self.calculate_turbulence()
            self.dataframe = self.dataframe.merge(turbulence_index, on="time")
            self.dataframe.sort_values(["time", "tic"], inplace=True)
            self.dataframe.reset_index(drop=True, inplace=True)

    def calculate_turbulence(self, time_period: int = 252) -> pd.DataFrame:
        """calculate turbulence index based on dow 30"""
        # can add other market assets
        df_price_pivot = self.dataframe.pivot(
            index="time", columns="tic", values="close"
        )
        # use returns to calculate turbulence
        df_price_pivot = df_price_pivot.pct_change()

        unique_date = self.dataframe["time"].unique()
        # start after a year
        start = time_period
        turbulence_index = [0] * start
        count = 0
        for i in range(start, len(unique_date)):
            current_price = df_price_pivot[df_price_pivot.index == unique_date[i]]
            # use a one-year rolling window to calculate covariance
            hist_price = df_price_pivot[
                (df_price_pivot.index < unique_date[i])
                & (df_price_pivot.index >= unique_date[i - time_period])
            ]
            # drop tickers which have more missing values than the "oldest" ticker
            filtered_hist_price = hist_price.iloc[
                hist_price.isna().sum().min() :
            ].dropna(axis=1)

            cov_temp = filtered_hist_price.cov()
            current_temp = current_price[list(filtered_hist_price)] - np.mean(
                filtered_hist_price, axis=0
            )

            temp = current_temp.values.dot(np.linalg.pinv(cov_temp)).dot(
                current_temp.values.T
            )
            if temp > 0:
                count += 1
                # avoid large outliers while the calculation is just beginning: else turbulence_temp = 0
                turbulence_temp = temp[0][0] if count > 2 else 0
            else:
                turbulence_temp = 0
            turbulence_index.append(turbulence_temp)

        turbulence_index = pd.DataFrame(
            {"time": df_price_pivot.index, "turbulence": turbulence_index}
        )
        return turbulence_index

    def add_vix(self):
        """
        add vix from processors
        :param data: (df) pandas dataframe
        :return: (df) pandas dataframe
        """
        if self.data_source in [
            "binance",
            "ccxt",
            "iexcloud",
            "joinquant",
            "quantconnect",
            "ricequant",
            "tushare",
        ]:
            print(
                f"VIX is not applicable for {self.data_source}. Return original DataFrame"
            )
            return None
        elif self.data_source == "yahoofinance":
            ticker = "^VIX"
        elif self.data_source == "alpaca":
            ticker = "VIXY"
        elif self.data_source == "wrds":
            ticker = "vix"
        else:
            # no VIX proxy defined for this data source
            return None
        df = self.dataframe.copy()
        # self.download_data(self.start_date, self.end_date, self.time_interval)
        self.download_data([ticker], save_path="./data/vix.csv")
        self.clean_data()
        cleaned_vix = self.dataframe
        # .rename(columns={ticker: "vix"})
        vix = cleaned_vix[["time", "close"]]
        cleaned_vix = vix.rename(columns={"close": "vix"})

        df = df.merge(cleaned_vix, on="time")
        df = df.sort_values(["time", "tic"]).reset_index(drop=True)
        self.dataframe = df

    def df_to_array(self, tech_indicator_list: List[str], if_vix: bool):
        unique_ticker = self.dataframe.tic.unique()
        price_array = np.column_stack(
            [self.dataframe[self.dataframe.tic == tic].close for tic in unique_ticker]
        )
        common_tech_indicator_list = [
            i
            for i in tech_indicator_list
            if i in self.dataframe.columns.values.tolist()
        ]
        tech_array = np.hstack(
            [
                self.dataframe.loc[
                    (self.dataframe.tic == tic), common_tech_indicator_list
                ]
                for tic in unique_ticker
            ]
        )
        if if_vix:
            risk_array = np.column_stack(
                [self.dataframe[self.dataframe.tic == tic].vix for tic in unique_ticker]
            )
        else:
            risk_array = (
                np.column_stack(
                    [
                        self.dataframe[self.dataframe.tic == tic].turbulence
                        for tic in unique_ticker
                    ]
                )
                if "turbulence" in self.dataframe.columns
                else None
            )
        print("Successfully transformed into array")
        return price_array, tech_array, risk_array

    # standard time_interval units: s: second, m: minute, h: hour, d: day, w: week, M: month, q: quarter, y: year
    # output: the nonstandard time_interval of this processor
    def calc_nonstandard_time_interval(self) -> str:
        if self.data_source == "alpaca":
            pass
        elif self.data_source == "baostock":
            # nonstandard_time_interval: default is d (daily k-line); d=daily, w=weekly,
            # m=monthly, 5/15/30/60=minute bars, case-insensitive. Indices have no
            # minute-level data; weekly bars are available only after the last trading
            # day of the week, monthly bars after the last trading day of the month.
            time_intervals = ["5m", "15m", "30m", "60m", "1d", "1w", "1M"]
            assert self.time_interval in time_intervals, (
                "This time interval is not supported. Supported time intervals: "
                + ",".join(time_intervals)
            )
            if (
                "d" in self.time_interval
                or "w" in self.time_interval
                or "M" in self.time_interval
            ):
                return self.time_interval[-1:].lower()
            elif "m" in self.time_interval:
                return self.time_interval[:-1]
        elif self.data_source == "binance":
            # nonstandard_time_interval: 1m,3m,5m,15m,30m,1h,2h,4h,6h,8h,12h,1d,3d,1w,1M
            time_intervals = [
                "1m", "3m", "5m", "15m", "30m", "1h", "2h", "4h",
                "6h", "8h", "12h", "1d", "3d", "1w", "1M",
            ]
            assert self.time_interval in time_intervals, (
                "This time interval is not supported. Supported time intervals: "
                + ",".join(time_intervals)
            )
            return self.time_interval
        elif self.data_source == "ccxt":
            pass
        elif self.data_source == "iexcloud":
            time_intervals = ["1d"]
            assert self.time_interval in time_intervals, (
                "This time interval is not supported. Supported time intervals: "
                + ",".join(time_intervals)
            )
            return self.time_interval.upper()
        elif self.data_source == "joinquant":
            # '1m', '5m', '15m', '30m', '60m', '120m', '1d', '1w', '1M'
            time_intervals = [
                "1m", "5m", "15m", "30m", "60m", "120m", "1d", "1w", "1M",
            ]
            assert self.time_interval in time_intervals, (
                "This time interval is not supported. Supported time intervals: "
                + ",".join(time_intervals)
            )
            return self.time_interval
        elif self.data_source == "quantconnect":
            pass
        elif self.data_source == "ricequant":
            # nonstandard_time_interval: 'd' - day, 'w' - week, 'm' - month, 'q' - quarter, 'y' - year
            time_intervals = ["d", "w", "M", "q", "y"]
            assert self.time_interval[-1] in time_intervals, (
                "This time interval is not supported. Supported time intervals: "
                + ",".join(time_intervals)
            )
            if "M" in self.time_interval:
                return self.time_interval.lower()
            else:
                return self.time_interval
        elif self.data_source == "tushare":
            # Minute bars (1, 5, 15, 30, 60 min) are not supported currently.
            # time_intervals = ["1m", "5m", "15m", "30m", "60m", "1d"]
            time_intervals = ["1d"]
            assert self.time_interval in time_intervals, (
                "This time interval is not supported. Supported time intervals: "
                + ",".join(time_intervals)
            )
            return self.time_interval
        elif self.data_source == "wrds":
            pass
        elif self.data_source == "yahoofinance":
            # nonstandard_time_interval: ["1m", "2m", "5m", "15m", "30m", "60m", "90m", "1h", "1d", "5d", "1wk", "1mo", "3mo"]
            time_intervals = [
                "1m", "2m", "5m", "15m", "30m", "60m", "90m",
                "1h", "1d", "5d", "1w", "1M", "3M",
            ]
            assert self.time_interval in time_intervals, (
                "This time interval is not supported. Supported time intervals: "
                + ",".join(time_intervals)
            )
            if "w" in self.time_interval:
                return self.time_interval + "k"
            elif "M" in self.time_interval:
                return self.time_interval[:-1] + "mo"
            else:
                return self.time_interval
        else:
            raise ValueError(
                f"Not support transfer_standard_time_interval for {self.data_source}"
            )

    # "600000.XSHG" -> "sh.600000"
    # "000612.XSHE" -> "sz.000612"
    def transfer_standard_ticker_to_nonstandard(self, ticker: str) -> str:
        return ticker

    def save_data(self, path):
        if ".csv" in path:
            path = path.split("/")
            filename = path[-1]
            path = "/".join(path[:-1] + [""])
        else:
            if path[-1] == "/":
                filename = "dataset.csv"
            else:
                filename = "/dataset.csv"

        os.makedirs(path, exist_ok=True)
        self.dataframe.to_csv(path + filename, index=False)

    def load_data(self, path):
        assert ".csv" in path  # only csv format is supported now
        self.dataframe = pd.read_csv(path)
        columns = self.dataframe.columns
        print(f"{path} loaded")
        # # check loaded file
        # assert "date" in columns or "time" in columns
        # assert "close" in columns


def calc_time_zone(
    ticker_list: List[str],
    time_zone_selfdefined: str,
    use_time_zone_selfdefined: int,
) -> str:
    assert isinstance(ticker_list, list)
    ticker_list = ticker_list[0]
    if use_time_zone_selfdefined == 1:
        time_zone = time_zone_selfdefined
    elif ticker_list in HSI_50_TICKER + SSE_50_TICKER + CSI_300_TICKER:
        time_zone = TIME_ZONE_SHANGHAI
    elif ticker_list in DOW_30_TICKER + NAS_100_TICKER + SP_500_TICKER:
        time_zone = TIME_ZONE_USEASTERN
    elif ticker_list in CAC_40_TICKER:
        time_zone = TIME_ZONE_PARIS
    elif ticker_list in DAX_30_TICKER + TECDAX_TICKER + MDAX_50_TICKER + SDAX_50_TICKER:
        time_zone = TIME_ZONE_BERLIN
    elif ticker_list in LQ45_TICKER:
        time_zone = TIME_ZONE_JAKARTA
    else:
        # hack needed to have this working with vix indicator
        # fix: unable to set time_zone_selfdefined from top-level dataprocessor class
        time_zone = TIME_ZONE_USEASTERN
        # raise ValueError("Time zone is wrong.")
    return time_zone


def check_date(d: str) -> bool:
    assert (
        len(d) == 10
    ), "Please check the length of date and use the correct date like 2020-01-01."
    indices = [0, 1, 2, 3, 5, 6, 8, 9]
    correct = True
    for i in indices:
        if not d[i].isdigit():
            correct = False
            break
    if not correct:
        raise ValueError("Please use the correct date like 2020-01-01.")
    return correct
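
Every concrete processor in this commit drives _Base the same way: fill self.dataframe in download_data, then rely on the shared clean_data / add_technical_indicator / df_to_array steps. A minimal sketch of that flow follows, assuming the finnlp package from this commit is importable; the DummyProcessor class and its synthetic OHLCV data are hypothetical, used only to exercise the pipeline without a live data source.

import numpy as np
import pandas as pd

from finnlp.data_processors._base import _Base


class DummyProcessor(_Base):
    def download_data(self, ticker_list, save_path="./data/dataset.csv"):
        # build a tiny synthetic OHLCV frame instead of hitting a real API
        dates = pd.date_range("2021-01-01", periods=30).strftime("%Y-%m-%d")
        rows = []
        for tic in ticker_list:
            close = 100 + np.cumsum(np.random.randn(len(dates)))
            for t, c in zip(dates, close):
                rows.append(
                    {"tic": tic, "time": t, "open": c, "high": c + 1,
                     "low": c - 1, "close": c, "volume": 1e4}
                )
        self.dataframe = pd.DataFrame(rows)


p = DummyProcessor("dummy", "2021-01-01", "2021-02-01", "1d")
p.download_data(["AAA", "BBB"])
p.clean_data()                                 # normalizes to the standard column schema
p.add_technical_indicator(["macd", "rsi_30"])  # stockstats path by default
price, tech, risk = p.df_to_array(["macd", "rsi_30"], if_vix=False)
print(price.shape, tech.shape)                 # one column per ticker / per (ticker, indicator)
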
finnlp/data_processors/akshare.py
ADDED
@@ -0,0 +1,148 @@
import copy
import os
import time
import warnings

warnings.filterwarnings("ignore")
from typing import List

import pandas as pd
from tqdm import tqdm

import stockstats
import talib

import akshare as ak  # pip install akshare

from finnlp.data_processors._base import _Base


class Akshare(_Base):
    def __init__(
        self,
        data_source: str,
        start_date: str,
        end_date: str,
        time_interval: str,
        **kwargs,
    ):
        start_date = self.transfer_date(start_date)
        end_date = self.transfer_date(end_date)

        super().__init__(data_source, start_date, end_date, time_interval, **kwargs)

        if "adj" in kwargs.keys():
            self.adj = kwargs["adj"]
            print(f"Using {self.adj} method.")
        else:
            self.adj = ""

        if "period" in kwargs.keys():
            self.period = kwargs["period"]
        else:
            self.period = "daily"

    def get_data(self, id) -> pd.DataFrame:
        return ak.stock_zh_a_hist(
            symbol=id,
            period=self.time_interval,
            start_date=self.start_date,
            end_date=self.end_date,
            adjust=self.adj,
        )

    def download_data(
        self, ticker_list: List[str], save_path: str = "./data/dataset.csv"
    ):
        """
        Builds a `pd.DataFrame` with 7 columns for each specified stock ticker:
        tic, time, open, high, low, close and volume.
        """
        assert self.time_interval in [
            "daily",
            "weekly",
            "monthly",
        ], "Not supported currently"

        self.ticker_list = ticker_list

        self.dataframe = pd.DataFrame()
        for i in tqdm(ticker_list, total=len(ticker_list)):
            nonstandard_id = self.transfer_standard_ticker_to_nonstandard(i)
            df_temp = self.get_data(nonstandard_id)
            df_temp["tic"] = i
            self.dataframe = pd.concat([self.dataframe, df_temp])
            # throttle requests to the akshare endpoint
            time.sleep(0.25)

        self.dataframe.columns = [
            "time",
            "open",
            "close",
            "high",
            "low",
            "volume",
            "amount",
            "amplitude",
            "pct_chg",
            "change",
            "turnover",
            "tic",
        ]

        self.dataframe.sort_values(by=["time", "tic"], inplace=True)
        self.dataframe.reset_index(drop=True, inplace=True)

        self.dataframe = self.dataframe[
            ["tic", "time", "open", "high", "low", "close", "volume"]
        ]
        self.dataframe["time"] = pd.to_datetime(
            self.dataframe["time"], format="%Y-%m-%d"
        )
        self.dataframe["day"] = self.dataframe["time"].dt.dayofweek
        self.dataframe["time"] = self.dataframe.time.apply(
            lambda x: x.strftime("%Y-%m-%d")
        )

        self.dataframe.dropna(inplace=True)
        self.dataframe.sort_values(by=["time", "tic"], inplace=True)
        self.dataframe.reset_index(drop=True, inplace=True)

        self.save_data(save_path)

        print(
            f"Download complete! Dataset saved to {save_path}. \nShape of DataFrame: {self.dataframe.shape}"
        )

    def data_split(self, df, start, end, target_date_col="time"):
        """
        split the dataset into training or testing using time
        :param data: (df) pandas dataframe, start, end
        :return: (df) pandas dataframe
        """
        data = df[(df[target_date_col] >= start) & (df[target_date_col] < end)]
        data = data.sort_values([target_date_col, "tic"], ignore_index=True)
        data.index = data[target_date_col].factorize()[0]
        return data

    def transfer_standard_ticker_to_nonstandard(self, ticker: str) -> str:
        # "600000.XSHG" -> "600000"
        # "000612.XSHE" -> "000612"
        # "600000.SH" -> "600000"
        # "000612.SZ" -> "000612"
        if "." in ticker:
            n, alpha = ticker.split(".")
            # assert alpha in ["XSHG", "XSHE"], "Wrong alpha"
            return n
        return ticker

    def transfer_date(self, time: str) -> str:
        if "-" in time:
            time = "".join(time.split("-"))
        elif "." in time:
            time = "".join(time.split("."))
        elif "/" in time:
            time = "".join(time.split("/"))
        return time
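
A short usage sketch for the Akshare processor above. The tickers and dates are illustrative; running it requires `pip install akshare` and network access. Note that transfer_date() strips the separators from the constructor dates, while data_split() compares against the re-formatted %Y-%m-%d strings produced by download_data.

from finnlp.data_processors.akshare import Akshare

processor = Akshare(
    data_source="akshare",
    start_date="2021-01-01",    # transfer_date() turns this into "20210101"
    end_date="2021-06-30",
    time_interval="daily",      # only daily/weekly/monthly are accepted
    adj="qfq",                  # forward-adjusted prices; omit for raw prices
)
processor.download_data(["600000.XSHG", "000612.XSHE"])  # saved to ./data/dataset.csv
# split on the "%Y-%m-%d" strings that download_data writes into the time column
train = processor.data_split(processor.dataframe, "2021-01-01", "2021-04-01")
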
finnlp/data_processors/alpaca.py
ADDED
@@ -0,0 +1,441 @@
from typing import List

import alpaca_trade_api as tradeapi
import numpy as np
import pandas as pd
import pytz

try:
    import exchange_calendars as tc
except ImportError:
    print(
        "Cannot import exchange_calendars.",
        "If you are using python>=3.7, please install it.",
    )
    import trading_calendars as tc

    print("Use trading_calendars instead for alpaca processor.")

from finnlp.data_processors._base import _Base
from finnlp.data_processors._base import calc_time_zone

from finnlp.utils.config import (
    TIME_ZONE_SHANGHAI,
    TIME_ZONE_USEASTERN,
    TIME_ZONE_PARIS,
    TIME_ZONE_BERLIN,
    TIME_ZONE_JAKARTA,
    TIME_ZONE_SELFDEFINED,
    USE_TIME_ZONE_SELFDEFINED,
    BINANCE_BASE_URL,
)


class Alpaca(_Base):
    def __init__(
        self,
        data_source: str,
        start_date: str,
        end_date: str,
        time_interval: str,
        **kwargs,
    ):
        super().__init__(data_source, start_date, end_date, time_interval, **kwargs)
        if kwargs["API"] is None:
            try:
                self.api = tradeapi.REST(
                    kwargs["API_KEY"],
                    kwargs["API_SECRET"],
                    kwargs["API_BASE_URL"],
                    "v2",
                )
            except BaseException:
                raise ValueError("Wrong Account Info!")
        else:
            self.api = kwargs["API"]

    def download_data(
        self,
        ticker_list,
        start_date,
        end_date,
        time_interval,
        save_path: str = "./data/dataset.csv",
    ) -> pd.DataFrame:
        self.time_zone = calc_time_zone(
            ticker_list, TIME_ZONE_SELFDEFINED, USE_TIME_ZONE_SELFDEFINED
        )
        start_date = pd.Timestamp(self.start_date, tz=self.time_zone)
        end_date = pd.Timestamp(self.end_date, tz=self.time_zone) + pd.Timedelta(
            days=1
        )
        self.time_interval = time_interval

        date = start_date
        data_df = pd.DataFrame()
        while date != end_date:
            start_time = (date + pd.Timedelta("09:30:00")).isoformat()
            end_time = (date + pd.Timedelta("15:59:00")).isoformat()
            for tic in ticker_list:
                barset = self.api.get_bars(
                    tic,
                    time_interval,
                    start=start_time,
                    end=end_time,
                    limit=500,
                ).df
                barset["tic"] = tic
                barset = barset.reset_index()
                data_df = pd.concat([data_df, barset], ignore_index=True)
            print("Data before " + end_time + " is successfully fetched")
            # print(data_df.head())
            date = date + pd.Timedelta(days=1)
            # compensate for daylight-saving shifts so the loop stays on midnight
            if date.isoformat()[-14:-6] == "01:00:00":
                date = date - pd.Timedelta("01:00:00")
            elif date.isoformat()[-14:-6] == "23:00:00":
                date = date + pd.Timedelta("01:00:00")
            if date.isoformat()[-14:-6] != "00:00:00":
                raise ValueError("Timezone Error")

        data_df["time"] = data_df["timestamp"].apply(
            lambda x: x.strftime("%Y-%m-%d %H:%M:%S")
        )
        self.dataframe = data_df

        self.save_data(save_path)

        print(
            f"Download complete! Dataset saved to {save_path}. \nShape of DataFrame: {self.dataframe.shape}"
        )

    def clean_data(self):
        df = self.dataframe.copy()
        tic_list = np.unique(df.tic.values)

        trading_days = self.get_trading_days(start=self.start_date, end=self.end_date)
        # produce the full time index: 390 one-minute bars per trading day
        times = []
        for day in trading_days:
            current_time = pd.Timestamp(day + " 09:30:00").tz_localize(self.time_zone)
            for _ in range(390):
                times.append(current_time)
                current_time += pd.Timedelta(minutes=1)
        # create a new dataframe with the full time series
        new_df = pd.DataFrame()
        for tic in tic_list:
            tmp_df = pd.DataFrame(
                columns=["open", "high", "low", "close", "volume"], index=times
            )
            tic_df = df[df.tic == tic]
            for i in range(tic_df.shape[0]):
                tmp_df.loc[tic_df.iloc[i]["time"]] = tic_df.iloc[i][
                    ["open", "high", "low", "close", "volume"]
                ]

            # if the close price of the first row is NaN
            if str(tmp_df.iloc[0]["close"]) == "nan":
                print(
                    "The price of the first row for ticker ",
                    tic,
                    " is NaN. ",
                    "It will be filled with the first valid price.",
                )
                for i in range(tmp_df.shape[0]):
                    if str(tmp_df.iloc[i]["close"]) != "nan":
                        first_valid_price = tmp_df.iloc[i]["close"]
                        tmp_df.iloc[0] = [
                            first_valid_price,
                            first_valid_price,
                            first_valid_price,
                            first_valid_price,
                            0.0,
                        ]
                        break
            # if the close price of the first row is still NaN
            # (all the prices are NaN in this case)
            if str(tmp_df.iloc[0]["close"]) == "nan":
                print(
                    "Missing data for ticker: ",
                    tic,
                    " . The prices are all NaN. Fill with 0.",
                )
                tmp_df.iloc[0] = [0.0, 0.0, 0.0, 0.0, 0.0]

            # forward filling row by row
            for i in range(tmp_df.shape[0]):
                if str(tmp_df.iloc[i]["close"]) == "nan":
                    previous_close = tmp_df.iloc[i - 1]["close"]
                    if str(previous_close) == "nan":
                        raise ValueError
                    tmp_df.iloc[i] = [
                        previous_close,
                        previous_close,
                        previous_close,
                        previous_close,
                        0.0,
                    ]
            tmp_df = tmp_df.astype(float)
            tmp_df["tic"] = tic
            new_df = pd.concat([new_df, tmp_df])

        new_df = new_df.reset_index()
        new_df = new_df.rename(columns={"index": "time"})

        print("Data clean finished!")

        self.dataframe = new_df

    def get_trading_days(self, start, end):
        nyse = tc.get_calendar("NYSE")
        df = nyse.sessions_in_range(
            pd.Timestamp(start, tz=pytz.UTC), pd.Timestamp(end, tz=pytz.UTC)
        )
        return [str(day)[:10] for day in df]

    def fetch_latest_data(
        self, ticker_list, time_interval, tech_indicator_list, limit=100
    ) -> pd.DataFrame:
        data_df = pd.DataFrame()
        for tic in ticker_list:
            barset = self.api.get_barset([tic], time_interval, limit=limit).df[tic]
            barset["tic"] = tic
            barset = barset.reset_index()
            data_df = pd.concat([data_df, barset], ignore_index=True)

        data_df = data_df.reset_index(drop=True)
        start_time = data_df.time.min()
        end_time = data_df.time.max()
        times = []
        current_time = start_time
        end = end_time + pd.Timedelta(minutes=1)
        while current_time != end:
            times.append(current_time)
            current_time += pd.Timedelta(minutes=1)

        df = data_df.copy()
        new_df = pd.DataFrame()
        for tic in ticker_list:
            tmp_df = pd.DataFrame(
                columns=["open", "high", "low", "close", "volume"], index=times
            )
            tic_df = df[df.tic == tic]
            for i in range(tic_df.shape[0]):
                tmp_df.loc[tic_df.iloc[i]["time"]] = tic_df.iloc[i][
                    ["open", "high", "low", "close", "volume"]
                ]

            if str(tmp_df.iloc[0]["close"]) == "nan":
                for i in range(tmp_df.shape[0]):
                    if str(tmp_df.iloc[i]["close"]) != "nan":
                        first_valid_close = tmp_df.iloc[i]["close"]
                        tmp_df.iloc[0] = [
                            first_valid_close,
                            first_valid_close,
                            first_valid_close,
                            first_valid_close,
                            0.0,
                        ]
                        break
            if str(tmp_df.iloc[0]["close"]) == "nan":
                print(
                    "Missing data for ticker: ",
                    tic,
                    " . The prices are all NaN. Fill with 0.",
                )
                tmp_df.iloc[0] = [0.0, 0.0, 0.0, 0.0, 0.0]

            for i in range(tmp_df.shape[0]):
                if str(tmp_df.iloc[i]["close"]) == "nan":
                    previous_close = tmp_df.iloc[i - 1]["close"]
                    if str(previous_close) == "nan":
                        raise ValueError
                    tmp_df.iloc[i] = [
                        previous_close,
                        previous_close,
                        previous_close,
                        previous_close,
                        0.0,
                    ]
            tmp_df = tmp_df.astype(float)
            tmp_df["tic"] = tic
            new_df = pd.concat([new_df, tmp_df])

        new_df = new_df.reset_index()
        new_df = new_df.rename(columns={"index": "time"})

        # run the shared indicator/array steps from _Base on the latest bars
        self.dataframe = new_df
        self.add_technical_indicator(tech_indicator_list)
        self.dataframe["VIXY"] = 0

        price_array, tech_array, turbulence_array = self.df_to_array(
            tech_indicator_list, if_vix=True
        )
        latest_price = price_array[-1]
        latest_tech = tech_array[-1]
        turb_df = self.api.get_barset(["VIXY"], time_interval, limit=1).df["VIXY"]
        latest_turb = turb_df["close"].values
        return latest_price, latest_tech, latest_turb

    def get_portfolio_history(self, start, end):
        trading_days = self.get_trading_days(start, end)
        df = pd.DataFrame()
        for day in trading_days:
            df = pd.concat(
                [
                    df,
                    self.api.get_portfolio_history(
                        date_start=day, timeframe="5Min"
                    ).df.iloc[:79],
                ]
            )
        equities = df.equity.values
        cumu_returns = equities / equities[0]
        cumu_returns = cumu_returns[~np.isnan(cumu_returns)]
        return cumu_returns
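
A usage sketch for the Alpaca processor above, assuming paper-trading credentials from alpaca.markets. The key strings are placeholders, and the "1Min" interval string is illustrative; newer alpaca_trade_api releases may expect a TimeFrame object for get_bars.

from finnlp.data_processors.alpaca import Alpaca

processor = Alpaca(
    data_source="alpaca",
    start_date="2021-03-01",
    end_date="2021-03-05",
    time_interval="1Min",
    API=None,                     # force construction from the key pair below
    API_KEY="PK...",              # placeholder paper-trading key
    API_SECRET="...",             # placeholder secret
    API_BASE_URL="https://paper-api.alpaca.markets",
)
processor.download_data(
    ["AAPL", "MSFT"], processor.start_date, processor.end_date, "1Min"
)
processor.clean_data()  # reindexes each ticker onto the full 390-minute NYSE session grid
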
finnlp/data_processors/alphavantage.py
ADDED
@@ -0,0 +1,92 @@
import datetime
import json
from typing import List

import pandas as pd
import requests

from finnlp.utils.config import BINANCE_BASE_URL
from finnlp.utils.config import TIME_ZONE_BERLIN
from finnlp.utils.config import TIME_ZONE_JAKARTA
from finnlp.utils.config import TIME_ZONE_PARIS
from finnlp.utils.config import TIME_ZONE_SELFDEFINED
from finnlp.utils.config import TIME_ZONE_SHANGHAI
from finnlp.utils.config import TIME_ZONE_USEASTERN
from finnlp.utils.config import USE_TIME_ZONE_SELFDEFINED
from finnlp.data_processors._base import _Base
from finnlp.data_processors._base import calc_time_zone


def transfer_date(d):
    # format a date as "YYYY-MM-DD" with zero-padded month and day
    date = str(d.year)
    date += "-"
    if len(str(d.month)) == 1:
        date += "0" + str(d.month)
    else:
        date += str(d.month)
    date += "-"
    if len(str(d.day)) == 1:
        date += "0" + str(d.day)
    else:
        date += str(d.day)
    return date


class Alphavantage(_Base):
    def __init__(
        self,
        data_source: str,
        start_date: str,
        end_date: str,
        time_interval: str,
        **kwargs,
    ):
        super().__init__(data_source, start_date, end_date, time_interval, **kwargs)

        assert time_interval == "1d", "please set the time_interval 1d"

    # supported time interval: ["1d"]
    def download_data(
        self, ticker_list: List[str], save_path: str = "./data/dataset.csv"
    ):
        # self.time_zone = calc_time_zone(
        #     ticker_list, TIME_ZONE_SELFDEFINED, USE_TIME_ZONE_SELFDEFINED
        # )
        self.dataframe = pd.DataFrame()
        for ticker in ticker_list:
            url = (
                "https://www.alphavantage.co/query?function=TIME_SERIES_DAILY&symbol="
                + ticker
                + "&apikey=demo"
            )
            r = requests.get(url)
            data = r.json()
            data2 = json.dumps(data["Time Series (Daily)"])
            df2 = pd.read_json(data2)

            # transpose: dates become the index, OHLCV fields become columns
            df3 = pd.DataFrame(df2.values.T, columns=df2.index, index=df2.columns)
            df3.rename(
                columns={
                    "1. open": "open",
                    "2. high": "high",
                    "3. low": "low",
                    "4. close": "close",
                    "5. volume": "volume",
                },
                inplace=True,
            )
            df3["tic"] = ticker
            dates = [transfer_date(df2.index[i]) for i in range(len(df2.index))]
            df3["date"] = dates
            self.dataframe = pd.concat([self.dataframe, df3])
        self.dataframe = self.dataframe.sort_values(by=["date", "tic"]).reset_index(
            drop=True
        )

        self.save_data(save_path)

        print(
            f"Download complete! Dataset saved to {save_path}. \nShape of DataFrame: {self.dataframe.shape}"
        )
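
A usage sketch for the Alphavantage processor above. download_data hard-codes apikey=demo in the query URL, and Alpha Vantage honors the demo key only for a few symbols such as IBM, so any other ticker would need a real key substituted into the URL.

from finnlp.data_processors.alphavantage import Alphavantage

processor = Alphavantage(
    data_source="alphavantage",
    start_date="2021-01-01",
    end_date="2021-06-30",
    time_interval="1d",   # the constructor asserts this is "1d"
)
processor.download_data(["IBM"])  # writes ./data/dataset.csv
print(processor.dataframe[["tic", "date", "close"]].head())
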
finnlp/data_processors/baostock.py
ADDED
@@ -0,0 +1,114 @@
from typing import List

import baostock as bs
import numpy as np
import pandas as pd
import pytz
import yfinance as yf

"""Reference: https://github.com/AI4Finance-LLC/FinRL"""

try:
    import exchange_calendars as tc
except:
    print(
        "Cannot import exchange_calendars.",
        "If you are using python>=3.7, please install it.",
    )
    import trading_calendars as tc

    print("Use trading_calendars instead for yahoofinance processor..")
# from basic_processor import _Base
from meta.data_processors._base import _Base
from meta.data_processors._base import calc_time_zone

from meta.config import (
    TIME_ZONE_SHANGHAI,
    TIME_ZONE_USEASTERN,
    TIME_ZONE_PARIS,
    TIME_ZONE_BERLIN,
    TIME_ZONE_JAKARTA,
    TIME_ZONE_SELFDEFINED,
    USE_TIME_ZONE_SELFDEFINED,
    BINANCE_BASE_URL,
)


class Baostock(_Base):
    def __init__(
        self,
        data_source: str,
        start_date: str,
        end_date: str,
        time_interval: str,
        **kwargs,
    ):
        super().__init__(data_source, start_date, end_date, time_interval, **kwargs)

    # Daily, weekly, and monthly K-lines, plus 5-, 15-, 30-, and 60-minute K-line data
    # ["5m", "15m", "30m", "60m", "1d", "1w", "1M"]
    def download_data(
        self, ticker_list: List[str], save_path: str = "./data/dataset.csv"
    ):
        lg = bs.login()
        print("baostock login respond error_code:" + lg.error_code)
        print("baostock login respond error_msg:" + lg.error_msg)

        self.time_zone = calc_time_zone(
            ticker_list, TIME_ZONE_SELFDEFINED, USE_TIME_ZONE_SELFDEFINED
        )
        self.dataframe = pd.DataFrame()
        for ticker in ticker_list:
            nonstandard_ticker = self.transfer_standard_ticker_to_nonstandard(ticker)
            # All supported: "date,code,open,high,low,close,preclose,volume,amount,adjustflag,turn,tradestatus,pctChg,isST"
            rs = bs.query_history_k_data_plus(
                nonstandard_ticker,
                "date,code,open,high,low,close,volume",
                start_date=self.start_date,
                end_date=self.end_date,
                frequency=self.time_interval,
                adjustflag="3",
            )

            print("baostock download_data respond error_code:" + rs.error_code)
            print("baostock download_data respond error_msg:" + rs.error_msg)

            data_list = []
            while (rs.error_code == "0") & rs.next():
                data_list.append(rs.get_row_data())
            df = pd.DataFrame(data_list, columns=rs.fields)
            df.loc[:, "code"] = [ticker] * df.shape[0]
            self.dataframe = pd.concat([self.dataframe, df])
        self.dataframe = self.dataframe.sort_values(by=["date", "code"]).reset_index(
            drop=True
        )
        bs.logout()

        self.dataframe.open = self.dataframe.open.astype(float)
        self.dataframe.high = self.dataframe.high.astype(float)
        self.dataframe.low = self.dataframe.low.astype(float)
        self.dataframe.close = self.dataframe.close.astype(float)
        self.save_data(save_path)

        print(
            f"Download complete! Dataset saved to {save_path}. \nShape of DataFrame: {self.dataframe.shape}"
        )

    def get_trading_days(self, start, end):
        lg = bs.login()
        print("baostock login respond error_code:" + lg.error_code)
        print("baostock login respond error_msg:" + lg.error_msg)
        result = bs.query_trade_dates(start_date=start, end_date=end)
        bs.logout()
        return result

    # "600000.XSHG" -> "sh.600000"
    # "000612.XSHE" -> "sz.000612"
    def transfer_standard_ticker_to_nonstandard(self, ticker: str) -> str:
        n, alpha = ticker.split(".")
        assert alpha in ["XSHG", "XSHE"], "Wrong alpha"
        if alpha == "XSHG":
            nonstandard_ticker = "sh." + n
        elif alpha == "XSHE":
            nonstandard_ticker = "sz." + n
        return nonstandard_ticker
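A minimal, hypothetical usage sketch for the Baostock processor (standard-suffix tickers are converted by transfer_standard_ticker_to_nonstandard; time_interval is passed straight to bs.query_history_k_data_plus, whose frequency codes, as an assumption about the baostock API, are "d"/"w"/"m" and "5"/"15"/"30"/"60"):

from meta.data_processors.baostock import Baostock  # adjust to the actual package layout

processor = Baostock(
    data_source="baostock",
    start_date="2021-01-04",
    end_date="2021-06-30",
    time_interval="d",  # daily bars; see the frequency note above
)
processor.download_data(["600000.XSHG", "000612.XSHE"], save_path="./data/baostock.csv")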
finnlp/data_processors/binance.py
ADDED
@@ -0,0 +1,434 @@
import datetime as dt
import json
import os
import urllib.error
import urllib.request
import zipfile
from datetime import *
from pathlib import Path
from typing import List

import pandas as pd
import requests

from meta.config import BINANCE_BASE_URL
from meta.config import TIME_ZONE_BERLIN
from meta.config import TIME_ZONE_JAKARTA
from meta.config import TIME_ZONE_PARIS
from meta.config import TIME_ZONE_SELFDEFINED
from meta.config import TIME_ZONE_SHANGHAI
from meta.config import TIME_ZONE_USEASTERN
from meta.config import USE_TIME_ZONE_SELFDEFINED
from meta.data_processors._base import _Base
from meta.data_processors._base import check_date

# from _base import check_date


class Binance(_Base):
    def __init__(
        self,
        data_source: str,
        start_date: str,
        end_date: str,
        time_interval: str,
        **kwargs,
    ):
        if time_interval == "1D":
            raise ValueError("Please use the time_interval 1d instead of 1D")
        if time_interval == "1d":
            check_date(start_date)
            check_date(end_date)
        super().__init__(data_source, start_date, end_date, time_interval, **kwargs)
        self.url = "https://api.binance.com/api/v3/klines"
        self.time_diff = None

    # main functions
    def download_data(
        self, ticker_list: List[str], save_path: str = "./data/dataset.csv"
    ):
        startTime = dt.datetime.strptime(self.start_date, "%Y-%m-%d")
        endTime = dt.datetime.strptime(self.end_date, "%Y-%m-%d")

        self.start_time = self.stringify_dates(startTime)
        self.end_time = self.stringify_dates(endTime)
        self.interval = self.time_interval
        self.limit = 1440

        # 1s for now, will add support for variable time and variable tick soon
        if self.time_interval == "1s":
            # as per https://binance-docs.github.io/apidocs/spot/en/#compressed-aggregate-trades-list
            self.limit = 1000
            final_df = self.fetch_n_combine(self.start_date, self.end_date, ticker_list)
        else:
            final_df = pd.DataFrame()
            for i in ticker_list:
                hist_data = self.dataframe_with_limit(symbol=i)
                df = hist_data.iloc[:-1].dropna()
                df["tic"] = i
                final_df = pd.concat([final_df, df], axis=0, join="outer")
        self.dataframe = final_df

        self.save_data(save_path)

        print(
            f"Download complete! Dataset saved to {save_path}. \nShape of DataFrame: {self.dataframe.shape}"
        )

    # def clean_data(self, df):
    #     df = df.dropna()
    #     return df

    # def add_technical_indicator(self, df, tech_indicator_list):
    #     print('Adding self-defined technical indicators is NOT supported yet.')
    #     print('Use default: MACD, RSI, CCI, DX.')
    #     self.tech_indicator_list = ['open', 'high', 'low', 'close', 'volume',
    #                                 'macd', 'macd_signal', 'macd_hist',
    #                                 'rsi', 'cci', 'dx']
    #     final_df = pd.DataFrame()
    #     for i in df.tic.unique():
    #         tic_df = df[df.tic==i]
    #         tic_df['macd'], tic_df['macd_signal'], tic_df['macd_hist'] = MACD(tic_df['close'], fastperiod=12,
    #                                                                           slowperiod=26, signalperiod=9)
    #         tic_df['rsi'] = RSI(tic_df['close'], timeperiod=14)
    #         tic_df['cci'] = CCI(tic_df['high'], tic_df['low'], tic_df['close'], timeperiod=14)
    #         tic_df['dx'] = DX(tic_df['high'], tic_df['low'], tic_df['close'], timeperiod=14)
    #         final_df = final_df.append(tic_df)
    #
    #     return final_df

    # def add_turbulence(self, df):
    #     print('Turbulence not supported yet. Return original DataFrame.')
    #
    #     return df

    # def add_vix(self, df):
    #     print('VIX is not applicable for cryptocurrencies. Return original DataFrame')
    #
    #     return df

    # def df_to_array(self, df, tech_indicator_list, if_vix):
    #     unique_ticker = df.tic.unique()
    #     price_array = np.column_stack([df[df.tic==tic].close for tic in unique_ticker])
    #     tech_array = np.hstack([df.loc[(df.tic==tic), tech_indicator_list] for tic in unique_ticker])
    #     assert price_array.shape[0] == tech_array.shape[0]
    #     return price_array, tech_array, np.array([])

    # helper functions
    def stringify_dates(self, date: dt.datetime):
        return str(int(date.timestamp() * 1000))

    def get_binance_bars(self, last_datetime, symbol):
        """
        klines api returns data in the following order:
        open_time, open_price, high_price, low_price, close_price,
        volume, close_time, quote_asset_volume, n_trades,
        taker_buy_base_asset_volume, taker_buy_quote_asset_volume,
        ignore
        """
        req_params = {
            "symbol": symbol,
            "interval": self.interval,
            "startTime": last_datetime,
            "endTime": self.end_time,
            "limit": self.limit,
        }
        # For debugging purposes, uncomment these lines and if they throw an error
        # then you may have an error in req_params
        # r = requests.get(self.url, params=req_params)
        # print(r.text)
        df = pd.DataFrame(requests.get(self.url, params=req_params).json())

        if df.empty:
            return None

        df = df.iloc[:, 0:6]
        df.columns = ["datetime", "open", "high", "low", "close", "volume"]

        df[["open", "high", "low", "close", "volume"]] = df[
            ["open", "high", "low", "close", "volume"]
        ].astype(float)

        # No stock split and dividend announcement, hence adjusted close is the same as close
        df["adjusted_close"] = df["close"]
        df["datetime"] = df.datetime.apply(
            lambda x: dt.datetime.fromtimestamp(x / 1000.0)
        )
        df.reset_index(drop=True, inplace=True)

        return df

    def get_newest_bars(self, symbols, interval, limit):
        merged_df = pd.DataFrame()
        for symbol in symbols:
            req_params = {
                "symbol": symbol,
                "interval": interval,
                "limit": limit,
            }

            df = pd.DataFrame(
                requests.get(self.url, params=req_params).json(),
                index=range(limit),
            )

            if df.empty:
                return None

            df = df.iloc[:, 0:6]
            df.columns = ["datetime", "open", "high", "low", "close", "volume"]

            df[["open", "high", "low", "close", "volume"]] = df[
                ["open", "high", "low", "close", "volume"]
            ].astype(float)

            # No stock split and dividend announcement, hence adjusted close is the same as close
            df["adjusted_close"] = df["close"]
            df["datetime"] = df.datetime.apply(
                lambda x: dt.datetime.fromtimestamp(x / 1000.0)
            )
            df["tic"] = symbol
            df = df.rename(columns={"datetime": "time"})
            df.reset_index(drop=True, inplace=True)
            merged_df = merged_df.append(df)

        return merged_df

    def dataframe_with_limit(self, symbol):
        final_df = pd.DataFrame()
        last_datetime = self.start_time
        while True:
            new_df = self.get_binance_bars(last_datetime, symbol)
            if new_df is None:
                break

            if last_datetime == self.end_time:
                break

            final_df = pd.concat([final_df, new_df], axis=0, join="outer")
            # last_datetime = max(new_df.datetime) + dt.timedelta(days=1)
            last_datetime = max(new_df.datetime)
            if isinstance(last_datetime, pd.Timestamp):
                last_datetime = last_datetime.to_pydatetime()

            if self.time_diff is None:
                self.time_diff = new_df.loc[1]["datetime"] - new_df.loc[0]["datetime"]

            last_datetime = last_datetime + self.time_diff
            last_datetime = self.stringify_dates(last_datetime)

        date_value = final_df["datetime"].apply(
            lambda x: x.strftime("%Y-%m-%d %H:%M:%S")
        )
        final_df.insert(0, "time", date_value)
        final_df.drop("datetime", inplace=True, axis=1)
        return final_df

    def get_download_url(self, file_url):
        return f"{BINANCE_BASE_URL}{file_url}"

    # downloads zip, unzips zip and deletes zip
    def download_n_unzip_file(self, base_path, file_name, date_range=None):
        download_path = f"{base_path}{file_name}"
        if date_range:
            date_range = date_range.replace(" ", "_")
            base_path = os.path.join(base_path, date_range)

        # raw_cache_dir = get_destination_dir("./cache/tick_raw")
        raw_cache_dir = "./cache/tick_raw"
        zip_save_path = os.path.join(raw_cache_dir, file_name)

        csv_name = os.path.splitext(file_name)[0] + ".csv"
        csv_save_path = os.path.join(raw_cache_dir, csv_name)

        fhandles = []

        if os.path.exists(csv_save_path):
            print(f"\nfile already exists! {csv_save_path}")
            return [csv_save_path]

        # make the "cache" directory (only)
        if not os.path.exists(raw_cache_dir):
            Path(raw_cache_dir).mkdir(parents=True, exist_ok=True)

        try:
            download_url = self.get_download_url(download_path)
            dl_file = urllib.request.urlopen(download_url)
            length = dl_file.getheader("content-length")
            blocksize = 4096  # default when the server sends no content-length header
            if length:
                length = int(length)
                blocksize = max(4096, length // 100)

            with open(zip_save_path, "wb") as out_file:
                dl_progress = 0
                print(f"\nFile Download: {zip_save_path}")
                while True:
                    buf = dl_file.read(blocksize)
                    if not buf:
                        break
                    out_file.write(buf)
                    # visuals
                    # dl_progress += len(buf)
                    # done = int(50 * dl_progress / length)
                    # sys.stdout.write("\r[%s%s]" % ('#' * done, '.' * (50-done)) )
                    # sys.stdout.flush()

            # unzip and delete zip
            with zipfile.ZipFile(zip_save_path) as zip:
                # guaranteed just 1 csv
                csvpath = zip.extract(zip.namelist()[0], raw_cache_dir)
            fhandles.append(csvpath)
            os.remove(zip_save_path)
            return fhandles

        except urllib.error.HTTPError:
            print(f"\nFile not found: {download_url}")

    def convert_to_date_object(self, d):
        year, month, day = [int(x) for x in d.split("-")]
        return date(year, month, day)

    def get_path(
        self,
        trading_type,
        market_data_type,
        time_period,
        symbol,
        interval=None,
    ):
        trading_type_path = "data/spot"
        # currently just supporting spot
        if trading_type != "spot":
            trading_type_path = f"data/futures/{trading_type}"
        return (
            f"{trading_type_path}/{time_period}/{market_data_type}/{symbol.upper()}/{interval}/"
            if interval is not None
            else f"{trading_type_path}/{time_period}/{market_data_type}/{symbol.upper()}/"
        )

    # helpers for manipulating tick level data (1s intervals)
    def download_daily_aggTrades(
        self, symbols, num_symbols, dates, start_date, end_date
    ):
        trading_type = "spot"
        date_range = start_date + " " + end_date
        start_date = self.convert_to_date_object(start_date)
        end_date = self.convert_to_date_object(end_date)

        print(f"Found {num_symbols} symbols")

        map = {}
        for current, symbol in enumerate(symbols):
            map[symbol] = []
            print(
                f"[{current + 1}/{num_symbols}] - start download daily {symbol} aggTrades "
            )
            for date in dates:
                current_date = self.convert_to_date_object(date)
                if current_date >= start_date and current_date <= end_date:
                    path = self.get_path(trading_type, "aggTrades", "daily", symbol)
                    file_name = f"{symbol.upper()}-aggTrades-{date}.zip"
                    fhandle = self.download_n_unzip_file(path, file_name, date_range)
                    map[symbol] += fhandle
        return map

    def fetch_aggTrades(self, startDate: str, endDate: str, tickers: List[str]):
        # all valid symbols traded on v3 api
        response = urllib.request.urlopen(
            "https://api.binance.com/api/v3/exchangeInfo"
        ).read()
        valid_symbols = list(
            map(
                lambda symbol: symbol["symbol"],
                json.loads(response)["symbols"],
            )
        )

        for tic in tickers:
            if tic not in valid_symbols:
                print(tic + " not a valid ticker, removing from download")
        tickers = list(set(tickers) & set(valid_symbols))
        num_symbols = len(tickers)
        # not adding tz yet
        # for ffill missing data on starting on first day 00:00:00 (if any)
        tminus1 = (self.convert_to_date_object(startDate) - dt.timedelta(1)).strftime(
            "%Y-%m-%d"
        )
        dates = pd.date_range(start=tminus1, end=endDate)
        dates = [date.strftime("%Y-%m-%d") for date in dates]
        return self.download_daily_aggTrades(
            tickers, num_symbols, dates, tminus1, endDate
        )

    # Dict[str]:List[str] -> pd.DataFrame
    def combine_raw(self, map):
        # same format as jingyang's current data format
        final_df = pd.DataFrame()
        # using AggTrades with headers from https://github.com/binance/binance-public-data/
        colNames = [
            "AggregatetradeId",
            "Price",
            "volume",
            "FirsttradeId",
            "LasttradeId",
            "time",
            "buyerWasMaker",
            "tradeWasBestPriceMatch",
        ]
        for tic in map.keys():
            security = pd.DataFrame()
            for i, csv in enumerate(map[tic]):
                dailyticks = pd.read_csv(
                    csv,
                    names=colNames,
                    index_col=["time"],
                    parse_dates=["time"],
                    date_parser=lambda epoch: pd.to_datetime(epoch, unit="ms"),
                )
                dailyfinal = dailyticks.resample("1s").agg(
                    {"Price": "ohlc", "volume": "sum"}
                )
                dailyfinal.columns = dailyfinal.columns.droplevel(0)
                # favor continuous series
                # dailyfinal.dropna(inplace=True)

                # implemented T-1 day ffill day start missing values
                # guaranteed first csv is tminus1 day
                if i == 0:
                    tmr = dailyfinal.index[0].date() + dt.timedelta(1)
                    tmr_dt = dt.datetime.combine(tmr, dt.time.min)
                    last_time_stamp_dt = dailyfinal.index[-1].to_pydatetime()
                    s_delta = (tmr_dt - last_time_stamp_dt).seconds
                    lastsample = dailyfinal.iloc[-1:]
                    lastsample.index = lastsample.index.shift(s_delta, "s")
                else:
                    day_dt = dailyfinal.index[0].date()
                    day_str = day_dt.strftime("%Y-%m-%d")
                    nextday_str = (day_dt + dt.timedelta(1)).strftime("%Y-%m-%d")
                    if dailyfinal.index[0].second != 0:
                        # append last sample
                        dailyfinal = lastsample.append(dailyfinal)
                    # otherwise, just reindex and ffill
                    dailyfinal = dailyfinal.reindex(
                        pd.date_range(day_str, nextday_str, freq="1s")[:-1],
                        method="ffill",
                    )
                    # save reference info (guaranteed to be :59)
                    lastsample = dailyfinal.iloc[-1:]
                    lastsample.index = lastsample.index.shift(1, "s")

                    if dailyfinal.shape[0] != 86400:
                        raise ValueError("everyday should have 86400 datapoints")

                    # only save real startDate - endDate
                    security = security.append(dailyfinal)

            security.ffill(inplace=True)
            security["tic"] = tic
            final_df = final_df.append(security)
        return final_df

    def fetch_n_combine(self, startDate, endDate, tickers):
        # return combine_raw(fetchAggTrades(startDate, endDate, tickers))
        mapping = self.fetch_aggTrades(startDate, endDate, tickers)
        return self.combine_raw(mapping)
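A minimal, hypothetical usage sketch for the Binance processor (needs network access to api.binance.com; dates must be %Y-%m-%d, matching the strptime calls in download_data):

from meta.data_processors.binance import Binance  # adjust to the actual package layout

processor = Binance(
    data_source="binance",
    start_date="2021-01-01",
    end_date="2021-01-31",
    time_interval="1d",  # "1D" is rejected in __init__; "1s" switches to the aggTrades path
)
processor.download_data(["BTCUSDT", "ETHUSDT"], save_path="./data/binance.csv")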
finnlp/data_processors/ccxt.py
ADDED
@@ -0,0 +1,143 @@
import calendar
from datetime import datetime
from typing import List

import ccxt
import numpy as np
import pandas as pd

from meta.data_processors._base import _Base

# from basic_processor import _Base


class Ccxt(_Base):
    def __init__(
        self,
        data_source: str,
        start_date: str,
        end_date: str,
        time_interval: str,
        **kwargs,
    ):
        super().__init__(data_source, start_date, end_date, time_interval, **kwargs)
        self.binance = ccxt.binance()

    def download_data(
        self, ticker_list: List[str], save_path: str = "./data/dataset.csv"
    ):
        crypto_column = pd.MultiIndex.from_product(
            [ticker_list, ["open", "high", "low", "close", "volume"]]
        )
        first_time = True
        for ticker in ticker_list:
            start_dt = datetime.strptime(self.start_date, "%Y%m%d %H:%M:%S")
            end_dt = datetime.strptime(self.end_date, "%Y%m%d %H:%M:%S")
            start_timestamp = calendar.timegm(start_dt.utctimetuple())
            end_timestamp = calendar.timegm(end_dt.utctimetuple())
            if self.time_interval == "1Min":
                date_list = [
                    datetime.utcfromtimestamp(float(time))
                    for time in range(start_timestamp, end_timestamp, 60 * 720)
                ]
            else:
                date_list = [
                    datetime.utcfromtimestamp(float(time))
                    for time in range(start_timestamp, end_timestamp, 60 * 1440)
                ]
            df = self.ohlcv(date_list, ticker, self.time_interval)
            if first_time:
                dataset = pd.DataFrame(columns=crypto_column, index=df["time"].values)
                first_time = False
            temp_col = pd.MultiIndex.from_product(
                [[ticker], ["open", "high", "low", "close", "volume"]]
            )
            dataset[temp_col] = df[["open", "high", "low", "close", "volume"]].values
            print("Actual end time: " + str(df["time"].values[-1]))
        self.dataframe = dataset

        self.save_data(save_path)

        print(
            f"Download complete! Dataset saved to {save_path}. \nShape of DataFrame: {self.dataframe.shape}"
        )

    # def add_technical_indicators(self, df, pair_list, tech_indicator_list = [
    #                             'macd', 'boll_ub', 'boll_lb', 'rsi_30', 'dx_30',
    #                             'close_30_sma', 'close_60_sma']):
    #     df = df.dropna()
    #     df = df.copy()
    #     column_list = [pair_list, ['open','high','low','close','volume']+(tech_indicator_list)]
    #     column = pd.MultiIndex.from_product(column_list)
    #     index_list = df.index
    #     dataset = pd.DataFrame(columns=column,index=index_list)
    #     for pair in pair_list:
    #         pair_column = pd.MultiIndex.from_product([[pair],['open','high','low','close','volume']])
    #         dataset[pair_column] = df[pair]
    #         temp_df = df[pair].reset_index().sort_values(by=['index'])
    #         temp_df = temp_df.rename(columns={'index':'date'})
    #         crypto_df = Sdf.retype(temp_df.copy())
    #         for indicator in tech_indicator_list:
    #             temp_indicator = crypto_df[indicator].values.tolist()
    #             dataset[(pair,indicator)] = temp_indicator
    #     print('Successfully added technical indicators')
    #     return dataset

    def df_to_ary(self, pair_list, tech_indicator_list=None):
        if tech_indicator_list is None:
            tech_indicator_list = [
                "macd",
                "boll_ub",
                "boll_lb",
                "rsi_30",
                "dx_30",
                "close_30_sma",
                "close_60_sma",
            ]
        df = self.dataframe
        df = df.dropna()
        date_ary = df.index.values
        price_array = df[pd.MultiIndex.from_product([pair_list, ["close"]])].values
        tech_array = df[
            pd.MultiIndex.from_product([pair_list, tech_indicator_list])
        ].values
        return price_array, tech_array, date_ary

    def min_ohlcv(self, dt, pair, limit):
        since = calendar.timegm(dt.utctimetuple()) * 1000
        return self.binance.fetch_ohlcv(
            symbol=pair, timeframe="1m", since=since, limit=limit
        )

    def ohlcv(self, dt, pair, period="1d"):
        ohlcv = []
        limit = 1000
        if period == "1Min":
            limit = 720
        elif period == "1D":
            limit = 1
        elif period == "1H":
            limit = 24
        elif period == "5Min":
            limit = 288
        for i in dt:
            start_dt = i
            since = calendar.timegm(start_dt.utctimetuple()) * 1000
            if period == "1Min":
                ohlcv.extend(self.min_ohlcv(start_dt, pair, limit))
            else:
                ohlcv.extend(
                    self.binance.fetch_ohlcv(
                        symbol=pair, timeframe=period, since=since, limit=limit
                    )
                )
        df = pd.DataFrame(
            ohlcv, columns=["time", "open", "high", "low", "close", "volume"]
        )
        df["time"] = [datetime.fromtimestamp(float(time) / 1000) for time in df["time"]]
        df["open"] = df["open"].astype(np.float64)
        df["high"] = df["high"].astype(np.float64)
        df["low"] = df["low"].astype(np.float64)
        df["close"] = df["close"].astype(np.float64)
        df["volume"] = df["volume"].astype(np.float64)
        return df
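A minimal, hypothetical usage sketch for the Ccxt processor; note that download_data parses dates with the "%Y%m%d %H:%M:%S" format and that tickers use ccxt's pair notation:

from meta.data_processors.ccxt import Ccxt  # adjust to the actual package layout

processor = Ccxt(
    data_source="ccxt",
    start_date="20210101 00:00:00",
    end_date="20210131 00:00:00",
    time_interval="1d",
)
processor.download_data(["BTC/USDT"], save_path="./data/ccxt.csv")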
finnlp/data_processors/fx.py
ADDED
@@ -0,0 +1,102 @@
import os
import sys

import pandas as pd
from finta import TA


def add_time_feature(df, symbol, dt_col_name="time"):
    """Read a csv into df and index on time.
    dt_col_name can be any unit from minutes to day; time becomes the index of the DataFrame.
    Must have columns [(time_col), (asset_col), Open, Close, High, Low, day].
    data_process will add additional time information: time (index), minute, hour, weekday, week, month, year, day (since 1970).
    Use StopLoss and ProfitTaken to simplify the action:
    feed a fixed StopLoss (SL = 200) and PT = SL * ratio.
    Action space: [action[0,2], ratio[0,10]];
    rewards is points.

    Adds hourly, dayofweek (0-6, Sun-Sat).
    Args:
        file (str): file path/name.csv
    """

    df["symbol"] = symbol
    df["dt"] = pd.to_datetime(df[dt_col_name])
    df.index = df["dt"]
    df["minute"] = df["dt"].dt.minute
    df["hour"] = df["dt"].dt.hour
    df["weekday"] = df["dt"].dt.dayofweek
    df["week"] = df["dt"].dt.isocalendar().week
    df["month"] = df["dt"].dt.month
    df["year"] = df["dt"].dt.year
    df["day"] = df["dt"].dt.day
    # df = df.set_index('dt')
    return df


# 'macd', 'boll_ub', 'boll_lb', 'rsi_30', 'dx_30','close_30_sma', 'close_60_sma'
def tech_indictors(df):
    df["macd"] = TA.MACD(df).SIGNAL
    df["boll_ub"] = TA.BBANDS(df).BB_UPPER
    df["boll_lb"] = TA.BBANDS(df).BB_LOWER
    df["rsi_30"] = TA.RSI(df, period=30)
    df["dx_30"] = TA.ADX(df, period=30)
    df["close_30_sma"] = TA.SMA(df, period=30)
    df["close_60_sma"] = TA.SMA(df, period=60)

    # fill NaN to 0
    df = df.fillna(0)
    print(
        f"--------df head - tail ----------------\n{df.head(3)}\n{df.tail(3)}\n---------------------------------"
    )

    return df


def split_timeserious(df, key_ts="dt", freq="W", symbol=""):
    """Import df, split it into hourly, daily, weekly, or monthly chunks,
    and save each chunk into a subfolder.

    Args:
        df (pandas df with timestamp as part of a multi index):
        spliter (str): H, D, W, M, Y
    """

    freq_name = {
        "H": "hourly",
        "D": "daily",
        "W": "weekly",
        "M": "monthly",
        "Y": "yearly",
    }
    for count, (n, g) in enumerate(df.groupby(pd.Grouper(level=key_ts, freq=freq))):
        p = f"./data/split/{symbol}/{freq_name[freq]}"
        os.makedirs(p, exist_ok=True)
        # fname = f'{symbol}_{n:%Y%m%d}_{freq}_{count}.csv'
        fname = f"{symbol}_{n:%Y}_{count}.csv"
        fn = f"{p}/{fname}"
        print(f"save to:{fn}")
        g.reset_index(drop=True, inplace=True)
        g.drop(columns=["dt"], inplace=True)
        g.to_csv(fn)
    return


"""
python ./neo_finrl/data_processors/fx.py GBPUSD W ./data/raw/GBPUSD_raw.csv
symbol="GBPUSD"
freq = [H, D, W, M]
file .csv, column names [time, Open, High, Low, Close, Vol]
"""
if __name__ == "__main__":
    symbol, freq, file = sys.argv[1], sys.argv[2], sys.argv[3]
    print(f"processing... symbol:{symbol} freq:{freq} file:{file}")
    try:
        df = pd.read_csv(file)
    except Exception:
        print(f"No such file or directory: {file}")
        exit(0)
    df = add_time_feature(df, symbol=symbol, dt_col_name="time")
    df = tech_indictors(df)
    split_timeserious(df, freq=freq, symbol=symbol)
    print(f"Done!")
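A small, hypothetical illustration of add_time_feature on a synthetic frame (tech_indictors additionally needs the finta-compatible Open/High/Low/Close columns, which this frame already carries):

import pandas as pd

from finnlp.data_processors.fx import add_time_feature  # adjust to the actual package layout

df = pd.DataFrame(
    {
        "time": pd.date_range("2021-01-04", periods=3, freq="H"),
        "Open": [1.36, 1.37, 1.38],
        "High": [1.37, 1.38, 1.39],
        "Low": [1.35, 1.36, 1.37],
        "Close": [1.36, 1.37, 1.38],
    }
)
df = add_time_feature(df, symbol="GBPUSD", dt_col_name="time")
print(df[["symbol", "hour", "weekday", "week"]])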
finnlp/data_processors/iexcloud.py
ADDED
@@ -0,0 +1,143 @@
import os
from datetime import datetime
from typing import List

import pandas as pd
import pandas_market_calendars as mcal
import pytz
import requests

from meta.data_processors._base import _Base

# from _base import _Base


class Iexcloud(_Base):
    @classmethod
    def _get_base_url(cls, mode: str) -> str:
        as1 = "mode must be sandbox or production."
        assert mode in {"sandbox", "production"}, as1

        if mode == "sandbox":
            return "https://sandbox.iexapis.com"

        return "https://cloud.iexapis.com"

    def __init__(
        self,
        data_source: str,
        start_date: str,
        end_date: str,
        time_interval: str,
        **kwargs,
    ):
        super().__init__(data_source, start_date, end_date, time_interval, **kwargs)
        self.base_url = self._get_base_url(mode=kwargs["mode"])
        self.token = kwargs["token"] or os.environ.get("IEX_TOKEN")

    def download_data(
        self, ticker_list: List[str], save_path: str = "./data/dataset.csv"
    ):
        """Returns end of day historical data for up to 15 years.

        Args:
            ticker_list (List[str]): List of the tickers to retrieve information.
            start_date (str): Oldest date of the range.
            end_date (str): Latest date of the range.

        Returns:
            pd.DataFrame: A pandas dataframe with end of day historical data
            for the specified tickers with the following columns:
            date, tic, open, high, low, close, adjusted_close, volume.

        Examples:
            kwargs['mode'] = 'sandbox'
            kwargs['token'] = 'Tsk_d633e2ff10d463...'
            >>> iex_dloader = Iexcloud(data_source='iexcloud', **kwargs)
            >>> iex_dloader.download_data(ticker_list=["AAPL", "NVDA"],
                                          start_date='2014-01-01',
                                          end_date='2021-12-12',
                                          time_interval='1D')
        """
        assert self.time_interval == "1D"  # one day

        price_data = pd.DataFrame()

        query_params = {
            "token": self.token,
        }

        if self.start_date and self.end_date:
            query_params["from"] = self.start_date
            query_params["to"] = self.end_date

        for stock in ticker_list:
            end_point = f"{self.base_url}/stable/time-series/HISTORICAL_PRICES/{stock}"

            response = requests.get(
                url=end_point,
                params=query_params,
            )
            if response.status_code != 200:
                raise requests.exceptions.RequestException(response.text)

            temp = pd.DataFrame.from_dict(data=response.json())
            temp["ticker"] = stock
            price_data = price_data.append(temp)
        price_data = price_data[
            [
                "date",
                "ticker",
                "open",
                "high",
                "low",
                "close",
                "fclose",
                "volume",
            ]
        ]
        price_data = price_data.rename(
            columns={
                "ticker": "tic",
                "date": "time",
                "fclose": "adjusted_close",
            }
        )

        # the "date" column was renamed to "time" above, so convert that column
        price_data["time"] = price_data["time"].map(
            lambda x: datetime.fromtimestamp(x / 1000, pytz.UTC).strftime("%Y-%m-%d")
        )

        self.dataframe = price_data

        self.save_data(save_path)

        print(
            f"Download complete! Dataset saved to {save_path}. \nShape of DataFrame: {self.dataframe.shape}"
        )

    def get_trading_days(self, start: str, end: str) -> List[str]:
        """Retrieves every trading day between two dates.

        Args:
            start (str): Oldest date of the range.
            end (str): Latest date of the range.

        Returns:
            List[str]: List of all trading days in YYYY-mm-dd format.

        Examples:
            >>> iex_dloader = Iexcloud(data_source='iexcloud',
                                       mode='sandbox',
                                       token='Tsk_d633e2ff10d463...')
            >>> iex_dloader.get_trading_days(start='2014-01-01',
                                             end='2021-12-12')
            ['2021-12-15', '2021-12-16', '2021-12-17']
        """
        nyse = mcal.get_calendar("NYSE")

        df = nyse.schedule(
            start_date=pd.Timestamp(start, tz=pytz.UTC),
            end_date=pd.Timestamp(end, tz=pytz.UTC),
        )
        return df.applymap(lambda x: x.strftime("%Y-%m-%d")).market_open.to_list()
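A hypothetical construction sketch; note that __init__ reads kwargs["token"] directly, so pass token=None explicitly to fall back to the IEX_TOKEN environment variable:

import os

os.environ.setdefault("IEX_TOKEN", "Tsk_your_sandbox_token")  # placeholder token
iex_dloader = Iexcloud(
    data_source="iexcloud",
    start_date="2021-01-01",
    end_date="2021-06-30",
    time_interval="1D",  # the only interval download_data accepts
    mode="sandbox",
    token=None,  # falls back to os.environ["IEX_TOKEN"]
)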
finnlp/data_processors/joinquant.py
ADDED
@@ -0,0 +1,68 @@
import copy
import datetime
import os
from typing import List

import jqdatasdk as jq
import numpy as np

from meta.data_processors._base import _Base


class Joinquant(_Base):
    def __init__(
        self,
        data_source: str,
        start_date: str,
        end_date: str,
        time_interval: str,
        **kwargs,
    ):
        super().__init__(data_source, start_date, end_date, time_interval, **kwargs)
        if "username" in kwargs.keys() and "password" in kwargs.keys():
            jq.auth(kwargs["username"], kwargs["password"])

    def download_data(
        self, ticker_list: List[str], save_path: str = "./data/dataset.csv"
    ):
        # joinquant supports: '1m', '5m', '15m', '30m', '60m', '120m', '1d', '1w', '1M'.
        # '1w' denotes one week; '1M' denotes one month.
        count = len(self.get_trading_days(self.start_date, self.end_date))
        df = jq.get_bars(
            security=ticker_list,
            count=count,
            unit=self.time_interval,
            fields=["date", "open", "high", "low", "close", "volume"],
            end_dt=self.end_date,
        )
        df = df.reset_index().rename(columns={"level_0": "tic"})
        self.dataframe = df

        self.save_data(save_path)

        print(
            f"Download complete! Dataset saved to {save_path}. \nShape of DataFrame: {self.dataframe.shape}"
        )

    @staticmethod
    def preprocess(df, stock_list):
        n = len(stock_list)
        N = df.shape[0]
        assert N % n == 0
        d = int(N / n)
        stock1_ary = df.iloc[0:d, 1:].values
        temp_ary = stock1_ary
        for j in range(1, n):
            stocki_ary = df.iloc[j * d : (j + 1) * d, 1:].values
            temp_ary = np.hstack((temp_ary, stocki_ary))
        return temp_ary

    # start_day: str
    # end_day: str
    # output: list of str_of_trade_day, e.g., ['2021-09-01', '2021-09-02']
    def get_trading_days(self, start_day: str, end_day: str) -> List[str]:
        dates = jq.get_trade_days(start_day, end_day)
        str_dates = []
        for d in dates:
            tmp = datetime.date.strftime(d, "%Y-%m-%d")
            str_dates.append(tmp)
        # str_dates = [date2str(dt) for dt in dates]
        return str_dates
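A minimal, hypothetical usage sketch for the Joinquant processor (requires valid JQData credentials; tickers use jqdatasdk's XSHG/XSHE suffixes):

from meta.data_processors.joinquant import Joinquant  # adjust to the actual package layout

processor = Joinquant(
    data_source="joinquant",
    start_date="2021-01-04",
    end_date="2021-06-30",
    time_interval="1d",
    username="your_jqdata_account",  # placeholder credentials
    password="your_jqdata_password",
)
processor.download_data(["000001.XSHE"], save_path="./data/joinquant.csv")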
finnlp/data_processors/quandl.py
ADDED
@@ -0,0 +1,85 @@
from typing import List

import numpy as np
import pandas as pd
import pytz
import quandl
import yfinance as yf

"""Reference: https://github.com/AI4Finance-LLC/FinRL"""

try:
    import exchange_calendars as tc
except:
    print(
        "Cannot import exchange_calendars.",
        "If you are using python>=3.7, please install it.",
    )
    import trading_calendars as tc

    print("Use trading_calendars instead for yahoofinance processor..")
# from basic_processor import _Base
from meta.data_processors._base import _Base
from meta.data_processors._base import calc_time_zone

from meta.config import (
    TIME_ZONE_SHANGHAI,
    TIME_ZONE_USEASTERN,
    TIME_ZONE_PARIS,
    TIME_ZONE_BERLIN,
    TIME_ZONE_JAKARTA,
    TIME_ZONE_SELFDEFINED,
    USE_TIME_ZONE_SELFDEFINED,
    BINANCE_BASE_URL,
)

TIME_ZONE_SELFDEFINED = TIME_ZONE_USEASTERN  # If neither of the above is your time zone, you should define it, and set USE_TIME_ZONE_SELFDEFINED 1.
USE_TIME_ZONE_SELFDEFINED = 1  # 0 (default) or 1 (use the self defined)


class Quandl(_Base):
    def __init__(
        self,
        data_source: str,
        start_date: str,
        end_date: str,
        time_interval: str,
        **kwargs,
    ):
        super().__init__(data_source, start_date, end_date, time_interval, **kwargs)

    def download_data(
        self, ticker_list: List[str], save_path: str = "./data/dataset.csv"
    ):
        self.time_zone = calc_time_zone(
            ticker_list, TIME_ZONE_SELFDEFINED, USE_TIME_ZONE_SELFDEFINED
        )

        # Download and save the data in a pandas DataFrame:
        # data_df = pd.DataFrame()
        # # set paginate to True because Quandl limits tables API to 10,000 rows per call
        # data = quandl.get_table('ZACKS/FC', paginate=True, ticker=ticker_list, per_end_date={'gte': '2021-09-01'}, qopts={'columns': ['ticker', 'per_end_date']})
        # data = quandl.get('ZACKS/FC', ticker=ticker_list, start_date="2020-12-31", end_date="2021-12-31")
        self.dataframe = quandl.get_table(
            "ZACKS/FC",
            ticker=ticker_list,
            qopts={"columns": ["ticker", "date", "adjusted_close"]},
            date={"gte": self.start_date, "lte": self.end_date},
            paginate=True,
        )
        self.dataframe.dropna(inplace=True)
        self.dataframe.reset_index(drop=True, inplace=True)
        print("Shape of DataFrame: ", self.dataframe.shape)
        # print("Display DataFrame: ", data_df.head())

        self.dataframe.sort_values(by=["date", "ticker"], inplace=True)
        self.dataframe.reset_index(drop=True, inplace=True)

        self.save_data(save_path)

        print(
            f"Download complete! Dataset saved to {save_path}. \nShape of DataFrame: {self.dataframe.shape}"
        )

    # def get_trading_days(self, start, end):
    #
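A hypothetical usage sketch (assumption: a Quandl/Nasdaq Data Link API key is conventionally set via quandl.ApiConfig.api_key, and the ZACKS/FC table queried above requires a subscription):

import quandl

quandl.ApiConfig.api_key = "your_quandl_api_key"  # placeholder key
processor = Quandl(
    data_source="quandl",
    start_date="2021-01-01",
    end_date="2021-06-30",
    time_interval="1d",
)
processor.download_data(["AAPL", "MSFT"], save_path="./data/quandl.csv")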
finnlp/data_processors/quantconnect.py
ADDED
@@ -0,0 +1,70 @@
from typing import List

from meta.config import BINANCE_BASE_URL
from meta.config import TIME_ZONE_BERLIN
from meta.config import TIME_ZONE_JAKARTA
from meta.config import TIME_ZONE_PARIS
from meta.config import TIME_ZONE_SELFDEFINED
from meta.config import TIME_ZONE_SHANGHAI
from meta.config import TIME_ZONE_USEASTERN
from meta.config import USE_TIME_ZONE_SELFDEFINED
from meta.data_processors._base import _Base

# from basic_processor import _Base


## The code of this file is used on the QuantConnect website, not locally.
class Quantconnect(_Base):
    def __init__(
        self,
        data_source: str,
        start_date: str,
        end_date: str,
        time_interval: str,
        **kwargs,
    ):
        super().__init__(data_source, start_date, end_date, time_interval, **kwargs)

    # def data_fetch(start_time, end_time, stock_list, resolution=Resolution.Daily) :
    #     #resolution: Daily, Hour, Minute, Second
    #     qb = QuantBook()
    #     for stock in stock_list:
    #         qb.AddEquity(stock)
    #     history = qb.History(qb.Securities.Keys, start_time, end_time, resolution)
    #     return history

    def download_data(
        self, ticker_list: List[str], save_path: str = "./data/dataset.csv"
    ):
        # self.time_zone = calc_time_zone(ticker_list, TIME_ZONE_SELFDEFINED, USE_TIME_ZONE_SELFDEFINED)

        # start_date = pd.Timestamp(start_date, tz=self.time_zone)
        # end_date = pd.Timestamp(end_date, tz=self.time_zone) + pd.Timedelta(days=1)
        qb = QuantBook()  # QuantBook is predefined in the QuantConnect research environment
        for stock in ticker_list:
            qb.AddEquity(stock)
        history = qb.History(
            qb.Securities.Keys,
            self.start_date,
            self.end_date,
            self.time_interval,
        )
        self.dataframe = history

        self.save_data(save_path)

        print(
            f"Download complete! Dataset saved to {save_path}. \nShape of DataFrame: {self.dataframe.shape}"
        )

    # def preprocess(df, stock_list):
    #     df = df[['open','high','low','close','volume']]
    #     if_first_time = True
    #     for stock in stock_list:
    #         if if_first_time:
    #             ary = df.loc[stock].values
    #             if_first_time = False
    #         else:
    #             temp = df.loc[stock].values
    #             ary = np.hstack((ary,temp))
    #     return ary
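A hypothetical sketch of how the class above would be driven; as the file's own comment says, it only runs on the QuantConnect research platform, where QuantBook and Resolution are predefined:

# Inside a QuantConnect research notebook only; QuantBook is not importable locally.
processor = Quantconnect(
    data_source="quantconnect",
    start_date="2021-01-01",
    end_date="2021-06-30",
    time_interval=Resolution.Daily,  # assumption: passed straight through to qb.History, so a Resolution value
)
processor.download_data(["AAPL", "MSFT"], save_path="./data/quantconnect.csv")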
finnlp/data_processors/ricequant.py
ADDED
@@ -0,0 +1,131 @@
from typing import List

import rqdatac as ricequant

from meta.data_processors._base import _Base


class Ricequant(_Base):
    def __init__(
        self,
        data_source: str,
        start_date: str,
        end_date: str,
        time_interval: str,
        **kwargs,
    ):
        super().__init__(data_source, start_date, end_date, time_interval, **kwargs)
        if kwargs["username"] is None or kwargs["password"] is None:
            ricequant.init()  # if the license is already set, you can init without username and password
        else:
            ricequant.init(
                kwargs["username"], kwargs["password"]
            )  # init with username and password

    def download_data(
        self, ticker_list: List[str], save_path: str = "./data/dataset.csv"
    ):
        # download data by calling RiceQuant API
        dataframe = ricequant.get_price(
            ticker_list,
            frequency=self.time_interval,
            start_date=self.start_date,
            end_date=self.end_date,
        )
        self.dataframe = dataframe

        self.save_data(save_path)

        print(
            f"Download complete! Dataset saved to {save_path}. \nShape of DataFrame: {self.dataframe.shape}"
        )

    # def clean_data(self, df) -> pd.DataFrame:
    #     ''' RiceQuant data is already cleaned, we only need to transform data format here.
    #     No need for filling NaN data'''
    #     df = df.copy()
    #     # raw df uses multi-index (tic,time), reset it to single index (time)
    #     df = df.reset_index(level=[0,1])
    #     # rename column order_book_id to tic
    #     df = df.rename(columns={'order_book_id':'tic', 'datetime':'time'})
    #     # reserve columns needed
    #     df = df[['tic','time','open','high','low','close','volume']]
    #     # check if there is NaN values
    #     assert not df.isnull().values.any()
    #     return df

    # def add_vix(self, data):
    #     print('VIX is NOT applicable to China A-shares')
    #     return data

    # def calculate_turbulence(self, data, time_period=252):
    #     # can add other market assets
    #     df = data.copy()
    #     df_price_pivot = df.pivot(index="date", columns="tic", values="close")
    #     # use returns to calculate turbulence
    #     df_price_pivot = df_price_pivot.pct_change()
    #
    #     unique_date = df.date.unique()
    #     # start after a fixed time period
    #     start = time_period
    #     turbulence_index = [0] * start
    #     # turbulence_index = [0]
    #     count = 0
    #     for i in range(start, len(unique_date)):
    #         current_price = df_price_pivot[df_price_pivot.index == unique_date[i]]
    #         # use one year rolling window to calculate covariance
    #         hist_price = df_price_pivot[
    #             (df_price_pivot.index < unique_date[i])
    #             & (df_price_pivot.index >= unique_date[i - time_period])
    #         ]
    #         # Drop tickers which has number missing values more than the "oldest" ticker
    #         filtered_hist_price = hist_price.iloc[hist_price.isna().sum().min():].dropna(axis=1)
    #
    #         cov_temp = filtered_hist_price.cov()
    #         current_temp = current_price[[x for x in filtered_hist_price]] - np.mean(filtered_hist_price, axis=0)
    #         temp = current_temp.values.dot(np.linalg.pinv(cov_temp)).dot(
    #             current_temp.values.T
    #         )
    #         if temp > 0:
    #             count += 1
    #             if count > 2:
    #                 turbulence_temp = temp[0][0]
    #             else:
    #                 # avoid large outlier because the calculation just begins
    #                 turbulence_temp = 0
    #         else:
    #             turbulence_temp = 0
    #         turbulence_index.append(turbulence_temp)
    #
    #     turbulence_index = pd.DataFrame(
    #         {"date": df_price_pivot.index, "turbulence": turbulence_index}
    #     )
    #     return turbulence_index
    #
    # def add_turbulence(self, data, time_period=252):
    #     """
    #     add turbulence index from a precalculated dataframe
    #     :param data: (df) pandas dataframe
    #     :return: (df) pandas dataframe
    #     """
    #     df = data.copy()
    #     turbulence_index = self.calculate_turbulence(df, time_period=time_period)
    #     df = df.merge(turbulence_index, on="date")
    #     df = df.sort_values(["date", "tic"]).reset_index(drop=True)
    #     return df

    # def df_to_array(self, df, tech_indicator_list, if_vix):
    #     df = df.copy()
    #     unique_ticker = df.tic.unique()
    #     if_first_time = True
    #     for tic in unique_ticker:
    #         if if_first_time:
    #             price_array = df[df.tic==tic][['close']].values
    #             tech_array = df[df.tic==tic][tech_indicator_list].values
    #             #risk_array = df[df.tic==tic]['turbulence'].values
    #             if_first_time = False
    #         else:
    #             price_array = np.hstack([price_array, df[df.tic==tic][['close']].values])
    #             tech_array = np.hstack([tech_array, df[df.tic==tic][tech_indicator_list].values])
    #     print('Successfully transformed into array')
    #     return price_array, tech_array, None
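A minimal, hypothetical usage sketch for the Ricequant processor; __init__ reads kwargs["username"] and kwargs["password"] directly, so pass both (None works when an rqdatac license is already configured locally):

from meta.data_processors.ricequant import Ricequant  # adjust to the actual package layout

processor = Ricequant(
    data_source="ricequant",
    start_date="2021-01-04",
    end_date="2021-06-30",
    time_interval="1d",
    username=None,  # None + None -> ricequant.init() using the local license
    password=None,
)
processor.download_data(["000001.XSHE"], save_path="./data/ricequant.csv")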
finnlp/data_processors/tushare.py
ADDED
@@ -0,0 +1,318 @@
import copy
import os
import time
import warnings

warnings.filterwarnings("ignore")
from typing import List

import pandas as pd
from tqdm import tqdm
from matplotlib import pyplot as plt

import stockstats
import talib
from meta.data_processors._base import _Base

import tushare as ts


class Tushare(_Base):
    """
    key-value in kwargs
    ----------
    token : str
        get from https://waditu.com/ after registration
    adj: str
        Whether to use adjusted closing price. Default is None.
        If you want to use forward adjusted closing price (前复权), please use 'qfq'.
        If you want to use backward adjusted closing price (后复权), please use 'hfq'.
    """

    def __init__(
        self,
        data_source: str,
        start_date: str,
        end_date: str,
        time_interval: str,
        **kwargs,
    ):
        super().__init__(data_source, start_date, end_date, time_interval, **kwargs)
        assert "token" in kwargs.keys(), "Please input token!"
        self.token = kwargs["token"]
        if "adj" in kwargs.keys():
            self.adj = kwargs["adj"]
            print(f"Using {self.adj} method.")
        else:
            self.adj = None

    def get_data(self, id) -> pd.DataFrame:
        # df1 = ts.pro_bar(ts_code=id, start_date=self.start_date,end_date='20180101')
        # dfb=pd.concat([df, df1], ignore_index=True)
        # print(dfb.shape)
        return ts.pro_bar(
            ts_code=id,
            start_date=self.start_date,
            end_date=self.end_date,
            adj=self.adj,
        )

    def download_data(
        self, ticker_list: List[str], save_path: str = "./data/dataset.csv"
    ):
        """
        `pd.DataFrame`
            7 columns: a tick symbol, time, open, high, low, close and volume
            for the specified stock ticker
        """
        assert self.time_interval == "1d", "Not supported currently"

        self.ticker_list = ticker_list
        ts.set_token(self.token)

        self.dataframe = pd.DataFrame()
        for i in tqdm(ticker_list, total=len(ticker_list)):
            # nonstandard_id = self.transfer_standard_ticker_to_nonstandard(i)
            # df_temp = self.get_data(nonstandard_id)
            df_temp = self.get_data(i)
            self.dataframe = self.dataframe.append(df_temp)
            # print("{} ok".format(i))
            time.sleep(0.25)

        self.dataframe.columns = [
            "tic",
            "time",
            "open",
            "high",
            "low",
            "close",
            "pre_close",
            "change",
            "pct_chg",
            "volume",
            "amount",
        ]
        self.dataframe.sort_values(by=["time", "tic"], inplace=True)
        self.dataframe.reset_index(drop=True, inplace=True)

        self.dataframe = self.dataframe[
            ["tic", "time", "open", "high", "low", "close", "volume"]
        ]
        # self.dataframe.loc[:, 'tic'] = pd.DataFrame((self.dataframe['tic'].tolist()))
        self.dataframe["time"] = pd.to_datetime(self.dataframe["time"], format="%Y%m%d")
        self.dataframe["day"] = self.dataframe["time"].dt.dayofweek
        self.dataframe["time"] = self.dataframe.time.apply(
            lambda x: x.strftime("%Y-%m-%d")
        )

        self.dataframe.dropna(inplace=True)
        self.dataframe.sort_values(by=["time", "tic"], inplace=True)
        self.dataframe.reset_index(drop=True, inplace=True)

        self.save_data(save_path)

        print(
            f"Download complete! Dataset saved to {save_path}. \nShape of DataFrame: {self.dataframe.shape}"
        )

    def data_split(self, df, start, end, target_date_col="time"):
        """
        split the dataset into training or testing using time
        :param data: (df) pandas dataframe, start, end
        :return: (df) pandas dataframe
        """
        data = df[(df[target_date_col] >= start) & (df[target_date_col] < end)]
        data = data.sort_values([target_date_col, "tic"], ignore_index=True)
        data.index = data[target_date_col].factorize()[0]
        return data

    def transfer_standard_ticker_to_nonstandard(self, ticker: str) -> str:
        # "600000.XSHG" -> "600000.SH"
        # "000612.XSHE" -> "000612.SZ"
        n, alpha = ticker.split(".")
        assert alpha in ["XSHG", "XSHE"], "Wrong alpha"
        if alpha == "XSHG":
            nonstandard_ticker = n + ".SH"
        elif alpha == "XSHE":
            nonstandard_ticker = n + ".SZ"
        return nonstandard_ticker

    def save_data(self, path):
        if ".csv" in path:
            path = path.split("/")
            filename = path[-1]
            path = "/".join(path[:-1] + [""])
        else:
            if path[-1] == "/":
                filename = "dataset.csv"
            else:
                filename = "/dataset.csv"

        os.makedirs(path, exist_ok=True)
        self.dataframe.to_csv(path + filename, index=False)

    def load_data(self, path):
        assert ".csv" in path  # only support csv format now
        self.dataframe = pd.read_csv(path)
        columns = self.dataframe.columns
        assert (
            "tic" in columns and "time" in columns and "close" in columns
        )  # input file must have "tic","time" and "close" columns
163 |
+
class ReturnPlotter:
|
164 |
+
"""
|
165 |
+
An easy-to-use plotting tool to plot cumulative returns over time.
|
166 |
+
Baseline supports equal weighting(default) and any stocks you want to use for comparison.
|
167 |
+
"""
|
168 |
+
|
169 |
+
def __init__(self, df_account_value, df_trade, start_date, end_date):
|
170 |
+
self.start = start_date
|
171 |
+
self.end = end_date
|
172 |
+
self.trade = df_trade
|
173 |
+
self.df_account_value = df_account_value
|
174 |
+
|
175 |
+
def get_baseline(self, ticket):
|
176 |
+
df = ts.get_hist_data(ticket, start=self.start, end=self.end)
|
177 |
+
df.loc[:, "dt"] = df.index
|
178 |
+
df.index = range(len(df))
|
179 |
+
df.sort_values(axis=0, by="dt", ascending=True, inplace=True)
|
180 |
+
df["time"] = pd.to_datetime(df["dt"], format="%Y-%m-%d")
|
181 |
+
return df
|
182 |
+
|
183 |
+
def plot(self, baseline_ticket=None):
|
184 |
+
"""
|
185 |
+
Plot cumulative returns over time.
|
186 |
+
use baseline_ticket to specify stock you want to use for comparison
|
187 |
+
(default: equal weighted returns)
|
188 |
+
"""
|
189 |
+
baseline_label = "Equal-weight portfolio"
|
190 |
+
tic2label = {"399300": "CSI 300 Index", "000016": "SSE 50 Index"}
|
191 |
+
if baseline_ticket:
|
192 |
+
# 使用指定ticket作为baseline
|
193 |
+
baseline_df = self.get_baseline(baseline_ticket)
|
194 |
+
baseline_date_list = baseline_df.time.dt.strftime("%Y-%m-%d").tolist()
|
195 |
+
df_date_list = self.df_account_value.time.tolist()
|
196 |
+
df_account_value = self.df_account_value[
|
197 |
+
self.df_account_value.time.isin(baseline_date_list)
|
198 |
+
]
|
199 |
+
baseline_df = baseline_df[baseline_df.time.isin(df_date_list)]
|
200 |
+
baseline = baseline_df.close.tolist()
|
201 |
+
baseline_label = tic2label.get(baseline_ticket, baseline_ticket)
|
202 |
+
ours = df_account_value.account_value.tolist()
|
203 |
+
else:
|
204 |
+
# 均等权重
|
205 |
+
all_date = self.trade.time.unique().tolist()
|
206 |
+
baseline = []
|
207 |
+
for day in all_date:
|
208 |
+
day_close = self.trade[self.trade["time"] == day].close.tolist()
|
209 |
+
avg_close = sum(day_close) / len(day_close)
|
210 |
+
baseline.append(avg_close)
|
211 |
+
ours = self.df_account_value.account_value.tolist()
|
212 |
+
|
213 |
+
ours = self.pct(ours)
|
214 |
+
baseline = self.pct(baseline)
|
215 |
+
|
216 |
+
days_per_tick = (
|
217 |
+
60 # you should scale this variable accroding to the total trading days
|
218 |
+
)
|
219 |
+
time = list(range(len(ours)))
|
220 |
+
datetimes = self.df_account_value.time.tolist()
|
221 |
+
ticks = [tick for t, tick in zip(time, datetimes) if t % days_per_tick == 0]
|
222 |
+
plt.title("Cumulative Returns")
|
223 |
+
plt.plot(time, ours, label="DDPG Agent", color="green")
|
224 |
+
plt.plot(time, baseline, label=baseline_label, color="grey")
|
225 |
+
plt.xticks([i * days_per_tick for i in range(len(ticks))], ticks, fontsize=7)
|
226 |
+
|
227 |
+
plt.xlabel("Date")
|
228 |
+
plt.ylabel("Cumulative Return")
|
229 |
+
|
230 |
+
plt.legend()
|
231 |
+
plt.show()
|
232 |
+
plt.savefig(f"plot_{baseline_ticket}.png")
|
233 |
+
|
234 |
+
def plot_all(self):
|
235 |
+
baseline_label = "Equal-weight portfolio"
|
236 |
+
tic2label = {"399300": "CSI 300 Index", "000016": "SSE 50 Index"}
|
237 |
+
|
238 |
+
# time lists
|
239 |
+
# algorithm time list
|
240 |
+
df_date_list = self.df_account_value.time.tolist()
|
241 |
+
|
242 |
+
# 399300 time list
|
243 |
+
csi300_df = self.get_baseline("399300")
|
244 |
+
csi300_date_list = csi300_df.time.dt.strftime("%Y-%m-%d").tolist()
|
245 |
+
|
246 |
+
# 000016 time list
|
247 |
+
sh50_df = self.get_baseline("000016")
|
248 |
+
sh50_date_list = sh50_df.time.dt.strftime("%Y-%m-%d").tolist()
|
249 |
+
|
250 |
+
# find intersection
|
251 |
+
all_date = sorted(
|
252 |
+
list(set(df_date_list) & set(csi300_date_list) & set(sh50_date_list))
|
253 |
+
)
|
254 |
+
|
255 |
+
# filter data
|
256 |
+
csi300_df = csi300_df[csi300_df.time.isin(all_date)]
|
257 |
+
baseline_300 = csi300_df.close.tolist()
|
258 |
+
baseline_label_300 = tic2label["399300"]
|
259 |
+
|
260 |
+
sh50_df = sh50_df[sh50_df.time.isin(all_date)]
|
261 |
+
baseline_50 = sh50_df.close.tolist()
|
262 |
+
baseline_label_50 = tic2label["000016"]
|
263 |
+
|
264 |
+
# 均等权重
|
265 |
+
baseline_equal_weight = []
|
266 |
+
for day in all_date:
|
267 |
+
day_close = self.trade[self.trade["time"] == day].close.tolist()
|
268 |
+
avg_close = sum(day_close) / len(day_close)
|
269 |
+
baseline_equal_weight.append(avg_close)
|
270 |
+
|
271 |
+
df_account_value = self.df_account_value[
|
272 |
+
self.df_account_value.time.isin(all_date)
|
273 |
+
]
|
274 |
+
ours = df_account_value.account_value.tolist()
|
275 |
+
|
276 |
+
ours = self.pct(ours)
|
277 |
+
baseline_300 = self.pct(baseline_300)
|
278 |
+
baseline_50 = self.pct(baseline_50)
|
279 |
+
baseline_equal_weight = self.pct(baseline_equal_weight)
|
280 |
+
|
281 |
+
days_per_tick = (
|
282 |
+
60 # you should scale this variable accroding to the total trading days
|
283 |
+
)
|
284 |
+
time = list(range(len(ours)))
|
285 |
+
datetimes = self.df_account_value.time.tolist()
|
286 |
+
ticks = [tick for t, tick in zip(time, datetimes) if t % days_per_tick == 0]
|
287 |
+
plt.title("Cumulative Returns")
|
288 |
+
plt.plot(time, ours, label="DDPG Agent", color="darkorange")
|
289 |
+
plt.plot(
|
290 |
+
time,
|
291 |
+
baseline_equal_weight,
|
292 |
+
label=baseline_label,
|
293 |
+
color="cornflowerblue",
|
294 |
+
) # equal weight
|
295 |
+
plt.plot(
|
296 |
+
time, baseline_300, label=baseline_label_300, color="lightgreen"
|
297 |
+
) # 399300
|
298 |
+
plt.plot(time, baseline_50, label=baseline_label_50, color="silver") # 000016
|
299 |
+
plt.xlabel("Date")
|
300 |
+
plt.ylabel("Cumulative Return")
|
301 |
+
|
302 |
+
plt.xticks([i * days_per_tick for i in range(len(ticks))], ticks, fontsize=7)
|
303 |
+
plt.legend()
|
304 |
+
plt.show()
|
305 |
+
plt.savefig("./plot_all.png")
|
306 |
+
|
307 |
+
def pct(self, l):
|
308 |
+
"""Get percentage"""
|
309 |
+
base = l[0]
|
310 |
+
return [x / base for x in l]
|
311 |
+
|
312 |
+
def get_return(self, df, value_col_name="account_value"):
|
313 |
+
df = copy.deepcopy(df)
|
314 |
+
df["daily_return"] = df[value_col_name].pct_change(1)
|
315 |
+
df["time"] = pd.to_datetime(df["time"], format="%Y-%m-%d")
|
316 |
+
df.set_index("time", inplace=True, drop=True)
|
317 |
+
df.index = df.index.tz_localize("UTC")
|
318 |
+
return pd.Series(df["daily_return"], index=df.index)
|
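A minimal usage sketch for the Tushare processor above, assuming a valid token from https://waditu.com/; the token string and tickers below are placeholders, and the date formats follow what the methods themselves expect (YYYYMMDD for ts.pro_bar, YYYY-MM-DD for data_split):

# Hypothetical usage of the Tushare processor defined above.
# "YOUR_TUSHARE_TOKEN" and the ticker list are placeholders.
from finnlp.data_processors.tushare import Tushare

processor = Tushare(
    data_source="tushare",
    start_date="20210101",
    end_date="20211231",
    time_interval="1d",          # the only interval download_data accepts
    token="YOUR_TUSHARE_TOKEN",  # placeholder; register at https://waditu.com/
    adj="qfq",                   # forward-adjusted close, per the docstring
)
processor.download_data(["600000.SH", "000612.SZ"], save_path="./data/dataset.csv")
train = processor.data_split(processor.dataframe, "2021-01-01", "2021-10-01")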
finnlp/data_processors/wrds.py
ADDED
@@ -0,0 +1,330 @@
import datetime
from typing import List

import numpy as np
import pandas as pd
import pytz
import wrds

try:
    import exchange_calendars as tc
except ImportError:
    print(
        "Cannot import exchange_calendars.",
        "If you are using python>=3.7, please install it.",
    )
    import trading_calendars as tc

    print("Use trading_calendars instead for wrds processor.")
# from basic_processor import _Base
from finnlp.data_processors._base import _Base

pd.options.mode.chained_assignment = None


class Wrds(_Base):
    # def __init__(self, if_offline=False):
    #     if not if_offline:
    #         self.db = wrds.Connection()
    def __init__(
        self,
        data_source: str,
        start_date: str,
        end_date: str,
        time_interval: str,
        **kwargs,
    ):
        super().__init__(data_source, start_date, end_date, time_interval, **kwargs)
        if "if_offline" in kwargs.keys() and not kwargs["if_offline"]:
            self.db = wrds.Connection()

    def download_data(
        self,
        ticker_list: List[str],
        if_save_tempfile=False,
        filter_shares=0,
        save_path: str = "./data/dataset.csv",
    ):
        dates = self.get_trading_days(self.start_date, self.end_date)
        print("Trading days: ")
        print(dates)
        first_time = True
        empty = True
        stock_set = tuple(ticker_list)
        for i in dates:
            x = self.data_fetch_wrds(i, stock_set, filter_shares, self.time_interval)

            if not x[1]:
                empty = False
                dataset = x[0]
                dataset = self.preprocess_to_ohlcv(
                    dataset, time_interval=(str(self.time_interval) + "S")
                )
                print("Data for date: " + i + " finished")
                if first_time:
                    temp = dataset
                    first_time = False
                else:
                    temp = pd.concat([temp, dataset])
                if if_save_tempfile:
                    temp.to_csv("./temp.csv")
        if empty:
            raise ValueError("Empty Data under input parameters!")
        result = temp
        result = result.sort_values(by=["time", "tic"])
        result = result.reset_index(drop=True)
        self.dataframe = result

        self.save_data(save_path)

        print(
            f"Download complete! Dataset saved to {save_path}. \nShape of DataFrame: {self.dataframe.shape}"
        )

    def preprocess_to_ohlcv(self, df, time_interval="60S"):
        df = df[["date", "time_m", "sym_root", "size", "price"]]
        tic_list = np.unique(df["sym_root"].values)
        final_df = None
        first_time = True
        for i in range(len(tic_list)):
            tic = tic_list[i]
            time_list = []
            temp_df = df[df["sym_root"] == tic]
            for j in range(temp_df.shape[0]):
                date = temp_df["date"].iloc[j]
                time_m = temp_df["time_m"].iloc[j]
                time = str(date) + " " + str(time_m)
                try:
                    time = datetime.datetime.strptime(time, "%Y-%m-%d %H:%M:%S.%f")
                except ValueError:
                    time = datetime.datetime.strptime(time, "%Y-%m-%d %H:%M:%S")
                time_list.append(time)
            temp_df["time"] = time_list
            temp_df = temp_df.set_index("time")
            data_ohlc = temp_df["price"].resample(time_interval).ohlc()
            data_v = temp_df["size"].resample(time_interval).agg({"size": "sum"})
            volume = data_v["size"].values
            data_ohlc["volume"] = volume
            data_ohlc["tic"] = tic
            if first_time:
                final_df = data_ohlc.reset_index()
                first_time = False
            else:
                # DataFrame.append was removed in pandas 2.0; use pd.concat instead
                final_df = pd.concat(
                    [final_df, data_ohlc.reset_index()], ignore_index=True
                )
        return final_df

    def clean_data(self):
        df = self.dataframe[["time", "open", "high", "low", "close", "volume", "tic"]]
        # remove 16:00 data
        tic_list = np.unique(df["tic"].values)
        ary = df.values
        rows_1600 = []
        for i in range(ary.shape[0]):
            row = ary[i]
            time = row[0]
            if str(time)[-8:] == "16:00:00":
                rows_1600.append(i)

        df = df.drop(rows_1600)
        df = df.sort_values(by=["tic", "time"])

        # check missing rows
        tic_dic = {tic: [0, 0] for tic in tic_list}
        ary = df.values
        for i in range(ary.shape[0]):
            row = ary[i]
            volume = row[5]
            tic = row[6]
            if volume != 0:
                tic_dic[tic][0] += 1
            tic_dic[tic][1] += 1
        constant = np.unique(df["time"].values).shape[0]
        nan_tics = [tic for tic, value in tic_dic.items() if value[1] != constant]
        # fill missing rows
        normal_time = np.unique(df["time"].values)

        df2 = df.copy()
        for tic in nan_tics:
            tic_time = df[df["tic"] == tic]["time"].values
            missing_time = [i for i in normal_time if i not in tic_time]
            for time in missing_time:
                temp_df = pd.DataFrame(
                    [[time, np.nan, np.nan, np.nan, np.nan, 0, tic]],
                    columns=[
                        "time",
                        "open",
                        "high",
                        "low",
                        "close",
                        "volume",
                        "tic",
                    ],
                )
                # DataFrame.append was removed in pandas 2.0; use pd.concat instead
                df2 = pd.concat([df2, temp_df], ignore_index=True)

        # fill nan data
        df = df2.sort_values(by=["tic", "time"])
        for i in range(df.shape[0]):
            if float(df.iloc[i]["volume"]) == 0:
                previous_close = df.iloc[i - 1]["close"]
                if str(previous_close) == "nan":
                    raise ValueError("Error nan price")
                df.iloc[i, 1] = previous_close
                df.iloc[i, 2] = previous_close
                df.iloc[i, 3] = previous_close
                df.iloc[i, 4] = previous_close
        # check if nan
        ary = df[["open", "high", "low", "close", "volume"]].values
        assert not np.isnan(np.min(ary))
        # final preprocess
        df = df[["time", "open", "high", "low", "close", "volume", "tic"]]
        df = df.reset_index(drop=True)
        print("Data clean finished")
        self.dataframe = df

    def get_trading_days(self, start, end):
        nyse = tc.get_calendar("NYSE")
        df = nyse.sessions_in_range(
            pd.Timestamp(start, tz=pytz.UTC), pd.Timestamp(end, tz=pytz.UTC)
        )
        return [str(day)[:10] for day in df]

    def data_fetch_wrds(
        self,
        date="2021-05-01",
        stock_set=("AAPL",),
        filter_shares=0,
        time_interval=60,
    ):
        # start_date, end_date should be in the same year
        current_date = datetime.datetime.strptime(date, "%Y-%m-%d")
        lib = "taqm_" + str(current_date.year)  # taqm_2021
        table = "ctm_" + current_date.strftime("%Y%m%d")  # ctm_20210501

        parm = {"syms": stock_set, "num_shares": filter_shares}
        try:
            data = self.db.raw_sql(
                "select * from "
                + lib
                + "."
                + table
                + " where sym_root in %(syms)s and time_m between '9:30:00' and '16:00:00' and size > %(num_shares)s and sym_suffix is null",
                params=parm,
            )
            if_empty = False
            return data, if_empty
        except Exception:
            print("Data for date: " + date + " error")
            if_empty = True
            return None, if_empty

    # def add_technical_indicator(self, df, tech_indicator_list = [
    #             'macd', 'boll_ub', 'boll_lb', 'rsi_30', 'dx_30',
    #             'close_30_sma', 'close_60_sma']):
    #     df = df.rename(columns={'time': 'date'})
    #     df = df.copy()
    #     df = df.sort_values(by=['tic', 'date'])
    #     stock = Sdf.retype(df.copy())
    #     unique_ticker = stock.tic.unique()
    #     tech_indicator_list = tech_indicator_list
    #
    #     for indicator in tech_indicator_list:
    #         indicator_df = pd.DataFrame()
    #         for i in range(len(unique_ticker)):
    #             # print(unique_ticker[i], i)
    #             temp_indicator = stock[stock.tic == unique_ticker[i]][indicator]
    #             temp_indicator = pd.DataFrame(temp_indicator)
    #             temp_indicator['tic'] = unique_ticker[i]
    #             # print(len(df[df.tic == unique_ticker[i]]['date'].to_list()))
    #             temp_indicator['date'] = df[df.tic == unique_ticker[i]]['date'].to_list()
    #             indicator_df = indicator_df.append(
    #                 temp_indicator, ignore_index=True
    #             )
    #         df = df.merge(indicator_df[['tic', 'date', indicator]], on=['tic', 'date'], how='left')
    #     df = df.sort_values(by=['date', 'tic'])
    #     print('Successfully add technical indicators')
    #     return df

    # def calculate_turbulence(self, data, time_period=252):
    #     # can add other market assets
    #     df = data.copy()
    #     df_price_pivot = df.pivot(index="date", columns="tic", values="close")
    #     # use returns to calculate turbulence
    #     df_price_pivot = df_price_pivot.pct_change()
    #
    #     unique_date = df.date.unique()
    #     # start after a fixed time period
    #     start = time_period
    #     turbulence_index = [0] * start
    #     # turbulence_index = [0]
    #     count = 0
    #     for i in range(start, len(unique_date)):
    #         current_price = df_price_pivot[df_price_pivot.index == unique_date[i]]
    #         # use one year rolling window to calculate covariance
    #         hist_price = df_price_pivot[
    #             (df_price_pivot.index < unique_date[i])
    #             & (df_price_pivot.index >= unique_date[i - time_period])
    #         ]
    #         # Drop tickers which have more missing values than the "oldest" ticker
    #         filtered_hist_price = hist_price.iloc[hist_price.isna().sum().min():].dropna(axis=1)
    #
    #         cov_temp = filtered_hist_price.cov()
    #         current_temp = current_price[[x for x in filtered_hist_price]] - np.mean(filtered_hist_price, axis=0)
    #         temp = current_temp.values.dot(np.linalg.pinv(cov_temp)).dot(
    #             current_temp.values.T
    #         )
    #         if temp > 0:
    #             count += 1
    #             if count > 2:
    #                 turbulence_temp = temp[0][0]
    #             else:
    #                 # avoid large outliers because the calculation has just begun
    #                 turbulence_temp = 0
    #         else:
    #             turbulence_temp = 0
    #         turbulence_index.append(turbulence_temp)
    #
    #     turbulence_index = pd.DataFrame(
    #         {"date": df_price_pivot.index, "turbulence": turbulence_index}
    #     )
    #     return turbulence_index
    #
    # def add_turbulence(self, data, time_period=252):
    #     """
    #     add turbulence index from a precalculated dataframe
    #     :param data: (df) pandas dataframe
    #     :return: (df) pandas dataframe
    #     """
    #     df = data.copy()
    #     turbulence_index = self.calculate_turbulence(df, time_period=time_period)
    #     df = df.merge(turbulence_index, on="date")
    #     df = df.sort_values(["date", "tic"]).reset_index(drop=True)
    #     return df

    # def add_vix(self, data):
    #     vix_df = self.download_data(['vix'], self.start, self.end_date, self.time_interval)
    #     cleaned_vix = self.clean_data(vix_df)
    #     vix = cleaned_vix[['date', 'close']]
    #
    #     df = data.copy()
    #     df = df.merge(vix, on="date")
    #     df = df.sort_values(["date", "tic"]).reset_index(drop=True)
    #
    #     return df

    # def df_to_array(self, df, tech_indicator_list):
    #     unique_ticker = df.tic.unique()
    #     print(unique_ticker)
    #     if_first_time = True
    #     for tic in unique_ticker:
    #         if if_first_time:
    #             price_array = df[df.tic==tic][['close']].values
    #             # price_ary = df[df.tic==tic]['close'].values
    #             tech_array = df[df.tic==tic][tech_indicator_list].values
    #             risk_array = df[df.tic==tic]['turbulence'].values
    #             if_first_time = False
    #         else:
    #             price_array = np.hstack([price_array, df[df.tic==tic][['close']].values])
    #             tech_array = np.hstack([tech_array, df[df.tic==tic][tech_indicator_list].values])
    #     print('Successfully transformed into array')
    #     return price_array, tech_array, risk_array
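A minimal usage sketch for the Wrds processor, assuming a WRDS account with access to the TAQ millisecond trade libraries (taqm_YYYY) that data_fetch_wrds queries; wrds.Connection() will prompt for credentials unless a .pgpass file is configured. Tickers and dates below are illustrative:

# Hypothetical usage of the Wrds processor defined above.
from finnlp.data_processors.wrds import Wrds

processor = Wrds(
    data_source="wrds",
    start_date="2021-05-03",
    end_date="2021-05-07",
    time_interval=60,    # seconds per OHLCV bar, per preprocess_to_ohlcv
    if_offline=False,    # open the database connection in __init__
)
processor.download_data(["AAPL", "MSFT"], filter_shares=0)
processor.clean_data()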
finnlp/data_processors/yahoofinance.py
ADDED
@@ -0,0 +1,190 @@
from typing import List

import numpy as np
import pandas as pd
import pytz
import yfinance as yf

try:
    import exchange_calendars as tc
except ImportError:
    print(
        "Cannot import exchange_calendars.",
        "If you are using python>=3.7, please install it.",
    )
    import trading_calendars as tc

    print("Use trading_calendars instead for yahoofinance processor.")

from finnlp.utils.config import (
    BINANCE_BASE_URL,
    TIME_ZONE_BERLIN,
    TIME_ZONE_JAKARTA,
    TIME_ZONE_PARIS,
    TIME_ZONE_SELFDEFINED,
    TIME_ZONE_SHANGHAI,
    TIME_ZONE_USEASTERN,
    USE_TIME_ZONE_SELFDEFINED,
)
from finnlp.data_processors._base import _Base, calc_time_zone


class Yahoofinance(_Base):
    def __init__(
        self,
        data_source: str,
        start_date: str,
        end_date: str,
        time_interval: str,
        **kwargs,
    ):
        super().__init__(data_source, start_date, end_date, time_interval, **kwargs)

    def download_data(
        self, ticker_list: List[str], save_path: str = "./data/dataset.csv"
    ):
        self.time_zone = calc_time_zone(
            ticker_list, TIME_ZONE_SELFDEFINED, USE_TIME_ZONE_SELFDEFINED
        )
        self.dataframe = pd.DataFrame()
        for tic in ticker_list:
            temp_df = yf.download(
                tic,
                start=self.start_date,
                end=self.end_date,
                interval=self.time_interval,
            )
            temp_df["tic"] = tic
            self.dataframe = pd.concat([self.dataframe, temp_df], axis=0, join="outer")
        self.dataframe.reset_index(inplace=True)
        try:
            self.dataframe.columns = [
                "date",
                "open",
                "high",
                "low",
                "close",
                "adjusted_close",
                "volume",
                "tic",
            ]
        except ValueError:
            # column count mismatch: the downloaded features are not supported
            print("the features are not supported currently")
        self.dataframe["day"] = self.dataframe["date"].dt.dayofweek
        print(self.dataframe)
        self.dataframe["date"] = self.dataframe.date.apply(
            lambda x: x.strftime("%Y-%m-%d")
        )
        self.dataframe.dropna(inplace=True)
        self.dataframe.reset_index(drop=True, inplace=True)
        print("Shape of DataFrame: ", self.dataframe.shape)
        self.dataframe.sort_values(by=["date", "tic"], inplace=True)
        self.dataframe.reset_index(drop=True, inplace=True)

        self.save_data(save_path)

        print(
            f"Download complete! Dataset saved to {save_path}. \nShape of DataFrame: {self.dataframe.shape}"
        )

    def clean_data(self):
        df = self.dataframe.copy()
        df = df.rename(columns={"date": "time"})
        time_interval = self.time_interval
        tic_list = np.unique(df.tic.values)
        trading_days = self.get_trading_days(start=self.start_date, end=self.end_date)
        if time_interval == "1D":
            times = trading_days
        elif time_interval == "1Min":
            times = []
            for day in trading_days:
                current_time = pd.Timestamp(day + " 09:30:00").tz_localize(
                    self.time_zone
                )
                for _ in range(390):
                    times.append(current_time)
                    current_time += pd.Timedelta(minutes=1)
        else:
            raise ValueError(
                "Data clean at given time interval is not supported for YahooFinance data."
            )
        new_df = pd.DataFrame()
        for tic in tic_list:
            print(("Clean data for ") + tic)
            tmp_df = pd.DataFrame(
                columns=[
                    "open",
                    "high",
                    "low",
                    "close",
                    "adjusted_close",
                    "volume",
                ],
                index=times,
            )
            # get data for the current ticker
            tic_df = df[df.tic == tic]
            # fill the empty DataFrame using the original data
            for i in range(tic_df.shape[0]):
                tmp_df.loc[tic_df.iloc[i]["time"]] = tic_df.iloc[i][
                    [
                        "open",
                        "high",
                        "low",
                        "close",
                        "adjusted_close",
                        "volume",
                    ]
                ]

            # if close on the start date is NaN, fill data with the first valid close
            # and set volume to 0.
            if str(tmp_df.iloc[0]["close"]) == "nan":
                print("NaN data on start date, fill using first valid data.")
                for i in range(tmp_df.shape[0]):
                    if str(tmp_df.iloc[i]["close"]) != "nan":
                        first_valid_close = tmp_df.iloc[i]["close"]
                        first_valid_adjclose = tmp_df.iloc[i]["adjusted_close"]
                        break  # stop at the first valid row, as the message says

                tmp_df.iloc[0] = [
                    first_valid_close,
                    first_valid_close,
                    first_valid_close,
                    first_valid_close,
                    first_valid_adjclose,
                    0.0,
                ]

            # fill NaN data with the previous close and set volume to 0.
            for i in range(tmp_df.shape[0]):
                if str(tmp_df.iloc[i]["close"]) == "nan":
                    previous_close = tmp_df.iloc[i - 1]["close"]
                    previous_adjusted_close = tmp_df.iloc[i - 1]["adjusted_close"]
                    if str(previous_close) == "nan":
                        raise ValueError
                    tmp_df.iloc[i] = [
                        previous_close,
                        previous_close,
                        previous_close,
                        previous_close,
                        previous_adjusted_close,
                        0.0,
                    ]

            # merge single-ticker data into the new DataFrame
            tmp_df = tmp_df.astype(float)
            tmp_df["tic"] = tic
            # DataFrame.append was removed in pandas 2.0; use pd.concat instead
            new_df = pd.concat([new_df, tmp_df])

            print(("Data clean for ") + tic + (" is finished."))

        # reset index and rename columns
        new_df = new_df.reset_index()
        new_df = new_df.rename(columns={"index": "time"})
        print("Data clean all finished!")
        self.dataframe = new_df

    def get_trading_days(self, start, end):
        nyse = tc.get_calendar("NYSE")
        df = nyse.sessions_in_range(pd.Timestamp(start), pd.Timestamp(end))
        return [str(day)[:10] for day in df]
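A minimal usage sketch for the Yahoofinance processor. Note the interval string is passed straight to yf.download, while clean_data checks for "1D" or "1Min"; whether the base class normalizes between yfinance's "1d" and this "1D" is an assumption here, so adjust the interval to whichever layer you are exercising:

# Hypothetical usage of the Yahoofinance processor defined above.
from finnlp.data_processors.yahoofinance import Yahoofinance

processor = Yahoofinance(
    data_source="yahoofinance",
    start_date="2021-01-01",
    end_date="2021-12-31",
    time_interval="1D",  # clean_data expects "1D"; yfinance itself uses "1d"
)
processor.download_data(["AAPL", "MSFT"], save_path="./data/dataset.csv")
processor.clean_data()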