online serving v10

microsoft · you-n-g · May 17, 2021 · Feb 16, 2021 · Feb 17, 2021 · Feb 26, 2021
commit 9dfd001f6fafbe077dcdc30feacd0dbeb7bf31e6
diff --git a/docs/advanced/task_management.rst b/docs/advanced/task_management.rst
@@ -55,6 +55,7 @@ More information of ``Task Manager`` can be found in `here <../reference/api.htm
 
 Task Training
 ===============
+#FIXME: Trainer
 After generating and storing those ``task``, it's time to run the ``task`` which are in the *WAITING* status.
 ``Qlib`` provides a method called ``run_task`` to run those ``task`` in task pool, however, users can also customize how tasks are executed.
 An easy way to get the ``task_func`` is using ``qlib.model.trainer.task_train`` directly.

diff --git a/examples/online_srv/online_management_simulate.py b/examples/online_srv/online_management_simulate.py
@@ -8,6 +8,7 @@
 import fire
 import qlib
 from qlib.model.trainer import DelayTrainerRM
+from qlib.workflow import R
 from qlib.workflow.online.manager import OnlineManager
 from qlib.workflow.online.strategy import RollingAverageStrategy
 from qlib.workflow.task.gen import RollingGen
@@ -110,31 +111,37 @@ def __init__(
         }
         qlib.init(provider_uri=provider_uri, region=region, mongo=mongo_conf)
         self.rolling_gen = RollingGen(
-            step=rolling_step, rtype=RollingGen.ROLL_SD, modify_end_time=False
-        )  # The rolling tasks generator, modify_end_time is false because we just need simulate to 2018-10-31.
+            step=rolling_step, rtype=RollingGen.ROLL_SD, ds_extra_mod_func=None
+        )  # The rolling tasks generator. ds_extra_mod_func is None because we only need to simulate until 2018-10-31 and don't need to change the handler end time.
         self.trainer = DelayTrainerRM(self.exp_name, self.task_pool)
         self.task_manager = TaskManager(self.task_pool)  # A good way to manage all your tasks
         self.rolling_online_manager = OnlineManager(
-            RollingAverageStrategy(
-                exp_name, task_template=tasks, rolling_gen=self.rolling_gen, trainer=self.trainer, need_log=False
-            ),
+            RollingAverageStrategy(exp_name, task_template=tasks, rolling_gen=self.rolling_gen, need_log=False),
+            trainer=self.trainer,
             begin_time=self.start_time,
             need_log=False,
         )
         self.tasks = tasks
 
+    # Reset everything to its initial state; be careful to save important data
+    def reset(self):
+        TaskManager(self.task_pool).remove()
+        exp = R.get_exp(experiment_name=self.exp_name)
+        for rid in exp.list_recorders():
+            exp.delete_recorder(rid)
+
     # Run this to run all workflow automatically
     def main(self):
         print("========== reset ==========")
-        self.rolling_online_manager.reset()
+        self.reset()
         print("========== simulate ==========")
         self.rolling_online_manager.simulate(end_time=self.end_time)
         print("========== collect results ==========")
         print(self.rolling_online_manager.get_collector()())
         print("========== signals ==========")
         print(self.rolling_online_manager.get_signals())
         print("========== online history ==========")
-        print(self.rolling_online_manager.get_online_history(self.exp_name))
+        print(self.rolling_online_manager.history)
 
 
 if __name__ == "__main__":

diff --git a/examples/online_srv/rolling_online_management.py b/examples/online_srv/rolling_online_management.py
@@ -18,8 +18,6 @@
 from qlib.workflow.task.gen import RollingGen
 from qlib.workflow.task.manage import TaskManager
 from qlib.workflow.online.manager import OnlineManager
-from qlib.workflow.task.utils import list_recorders
-from qlib.model.trainer import TrainerRM
 
 data_handler_config = {
     "start_time": "2013-01-01",
@@ -86,7 +84,7 @@ def __init__(
         task_url="mongodb://10.0.0.4:27017/",
         task_db_name="rolling_db",
         rolling_step=550,
-        tasks=[task_xgboost_config],  # , task_lgb_config],
+        tasks=[task_xgboost_config, task_lgb_config],
     ):
         mongo_conf = {
             "task_url": task_url,  # your MongoDB url
@@ -103,7 +101,6 @@ def __init__(
                     name_id,
                     task,
                     RollingGen(step=rolling_step, rtype=RollingGen.ROLL_SD),
-                    TrainerRM(experiment_name=name_id, task_pool=name_id),
                 )
             )
 
@@ -116,9 +113,8 @@ def __init__(
 
     # Reset all things to the first status, be careful to save important data
     def reset(self):
-        print("========== reset ==========")
         for task in self.tasks:
-            name_id = task["model"]["class"] + "_" + str(self.rolling_step)
+            name_id = task["model"]["class"]
             TaskManager(name_id).remove()
             exp = R.get_exp(experiment_name=name_id)
             for rid in exp.list_recorders():
@@ -127,12 +123,9 @@ def reset(self):
             if os.path.exists(self._ROLLING_MANAGER_PATH):
                 os.remove(self._ROLLING_MANAGER_PATH)
 
-            for rid in list_recorders("OnlineManagerSignals", lambda x: True if x.info["name"] == name_id else False):
-                exp.delete_recorder(rid)
-
     def first_run(self):
         print("========== reset ==========")
-        self.rolling_online_manager.reset()
+        self.reset()
         print("========== first_run ==========")
         self.rolling_online_manager.first_train()
         print("========== dump ==========")

diff --git a/qlib/model/ens/ensemble.py b/qlib/model/ens/ensemble.py
@@ -7,6 +7,7 @@
 
 from typing import Union
 import pandas as pd
+from qlib.utils import flatten_dict
 
 
 class Ensemble:
@@ -77,19 +78,22 @@ def __call__(self, ensemble_dict: dict) -> pd.DataFrame:
 class AverageEnsemble(Ensemble):
     def __call__(self, ensemble_dict: dict):
         """
-        Average a dict of same shape dataframe like `prediction` or `IC` into an ensemble.
+        Standardize and average a dict of same-shaped DataFrames (such as `prediction` or `IC`) into an ensemble.
 
-        NOTE: The values of dict must be pd.DataFrame, and have the index "datetime"
+        NOTE: The values of the dict must be pd.DataFrame objects that have the index "datetime". A nested dict is flattened first.
 
         Args:
             ensemble_dict (dict): a dict like {"A": pd.DataFrame, "B": pd.DataFrame}.
             The key of the dict will be ignored.
 
         Returns:
-            pd.DataFrame: the complete result of averaging.
+            pd.Series: the complete result of standardizing and averaging (``mean(axis=1)`` yields a Series).
         """
+        # need to flatten the nested dict
+        ensemble_dict = flatten_dict(ensemble_dict)
         values = list(ensemble_dict.values())
         results = pd.concat(values, axis=1)
-        results = results.mean(axis=1).to_frame("score")
+        results = results.groupby("datetime").apply(lambda df: (df - df.mean()) / df.std())
+        results = results.mean(axis=1)
         results = results.sort_index()
         return results
diff --git a/qlib/model/ens/group.py b/qlib/model/ens/group.py
@@ -36,20 +36,36 @@ def __init__(self, group_func=None, ens: Ensemble = None):
         self._ens_func = ens
 
     def group(self, *args, **kwargs) -> dict:
-        # TODO: such design is weird when `_group_func` is the only configurable part in the class
+        """
+        Group a set of objects and turn them into a dict.
+
+        For example: {(A,B,C1): object, (A,B,C2): object} -> {(A,B): {C1: object, C2: object}}
+
+        Returns:
+            dict: grouped dict
+        """
         if isinstance(getattr(self, "_group_func", None), Callable):
             return self._group_func(*args, **kwargs)
         else:
             raise NotImplementedError(f"Please specify valid `group_func`.")
 
     def reduce(self, *args, **kwargs) -> dict:
+        """
+        Reduce the grouped dict in some way.
+
+        For example: {(A,B): {C1: object, C2: object}} -> {(A,B): object}
+
+        Returns:
+            dict: reduced dict
+        """
         if isinstance(getattr(self, "_ens_func", None), Callable):
             return self._ens_func(*args, **kwargs)
         else:
             raise NotImplementedError(f"Please specify valid `_ens_func`.")
 
     def __call__(self, ungrouped_dict: dict, n_jobs=1, verbose=0, *args, **kwargs) -> dict:
-        """Group the ungrouped_dict into different groups.
+        """
+        Group the ungrouped_dict into different groups.
 
         Args:
             ungrouped_dict (dict): the ungrouped dict waiting for grouping like {name: things}