Merge pull request #7 from aoki-h-jp/feature/1.0.0/information-correlation

Feature/1.0.0/information correlation
aoki-h-jp · Sep 3, 2023 · d499c05 · d499c05
2 parents 32ef425 + 783ca35
commit d499c05
Show file tree

Hide file tree

Showing 2 changed files with 72 additions and 28 deletions.
diff --git a/crypto_features/feature/information_correlation.py b/crypto_features/feature/information_correlation.py
@@ -33,11 +33,11 @@ def run_calculate(
         if not os.path.exists("information_correlation"):
             os.mkdir("information_correlation")
 
-        klines = klines.copy()
-        feature = feature.copy()
-
         close_chg_pct_header = f"close_chg_pct_after_{return_minutes}min"
-        klines[close_chg_pct_header] = klines["close"].pct_change(return_minutes)
+        klines["close"] = klines["close"].astype(float)
+        klines[close_chg_pct_header] = klines["close"].pct_change(
+            return_minutes, fill_method="bfill"
+        )
         klines[close_chg_pct_header] = klines[close_chg_pct_header].shift(
             -return_minutes
         )

diff --git a/crypto_features/feature/preprocessing.py b/crypto_features/feature/preprocessing.py
@@ -29,29 +29,7 @@ def _load_klines_data(self, symbol) -> pd.DataFrame:
         :return: preprocessed klines data
         """
         # Load klines data
-        # merge all csv files
-        df = pd.DataFrame()
-        for file in os.listdir(
-            os.path.join(self._data_dir, self._BINANCE_KLINES_DIR, symbol, "1m")
-        ):
-            df = pd.concat(
-                [
-                    df,
-                    pd.read_csv(
-                        "/".join(
-                            [
-                                self._data_dir,
-                                self._BINANCE_KLINES_DIR,
-                                symbol,
-                                "1m",
-                                file,
-                            ]
-                        )
-                    ),
-                ]
-            )
-
-        df.columns = [
+        headers = [
             "timestamp_open",
             "open",
             "high",
@@ -65,8 +43,74 @@ def _load_klines_data(self, symbol) -> pd.DataFrame:
             "taker_buy_quote_volume",
             "ignore",
         ]
-        df["timestamp_open"] = pd.to_datetime(df["timestamp_open"], utc=True, unit="ms")
+
+        raw_headers = [
+            "open_time",
+            "open",
+            "high",
+            "low",
+            "close",
+            "volume",
+            "close_time",
+            "quote_volume",
+            "count",
+            "taker_buy_volume",
+            "taker_buy_quote_volume",
+            "ignore",
+        ]
+
+        # merge all csv files
+        df = pd.DataFrame(columns=headers)
+        for file in os.listdir(
+            os.path.join(self._data_dir, self._BINANCE_KLINES_DIR, symbol, "1m")
+        ):
+            # header check
+            df_append_tmp = pd.read_csv(
+                "/".join(
+                    [
+                        self._data_dir,
+                        self._BINANCE_KLINES_DIR,
+                        symbol,
+                        "1m",
+                        file,
+                    ]
+                ),
+                nrows=1,
+            )
+
+            if list(df_append_tmp) != raw_headers:
+                df_append = pd.read_csv(
+                    "/".join(
+                        [
+                            self._data_dir,
+                            self._BINANCE_KLINES_DIR,
+                            symbol,
+                            "1m",
+                            file,
+                        ]
+                    ),
+                    names=headers,
+                )
+            else:
+                df_append = pd.read_csv(
+                    "/".join(
+                        [
+                            self._data_dir,
+                            self._BINANCE_KLINES_DIR,
+                            symbol,
+                            "1m",
+                            file,
+                        ]
+                    ),
+                    header=None,
+                )
+                df_append = df_append.drop(0, axis=0)
+                df_append.columns = headers
+
+            df = pd.concat([df, df_append])
+
         df.set_index("timestamp_open", inplace=True)
+        df.index = pd.to_datetime(df.index, utc=True, unit="ms")
 
         return df