trainer & group & collect & ensemble

microsoft · you-n-g · May 17, 2021 · Feb 16, 2021 · Feb 17, 2021 · Feb 26, 2021
commit bd7a1c11b981099cdbe9a69429e3566be36854be
diff --git a/examples/model_rolling/task_manager_rolling.py b/examples/model_rolling/task_manager_rolling.py
@@ -8,9 +8,11 @@
 from qlib.workflow.task.gen import RollingGen, task_generator
 from qlib.workflow.task.manage import TaskManager, run_task
 from qlib.workflow.task.collect import RecorderCollector
-from qlib.workflow.task.ensemble import RollingEnsemble
+from qlib.model.ens.ensemble import RollingEnsemble, ens_workflow
 import pandas as pd
 from qlib.workflow.task.utils import list_recorders
+from qlib.model.ens.group import RollingGroup
+from qlib.model.trainer import TrainerRM
 
 data_handler_config = {
     "start_time": "2008-01-01",
@@ -94,39 +96,31 @@ def task_generating():
     return tasks
 
 
-# This part corresponds to "Task Storing" in the document
-def task_storing(tasks, task_pool, exp_name):
-    print("========== task_storing ==========")
-    tm = TaskManager(task_pool=task_pool)
-    tm.create_task(tasks)  # all tasks will be saved to MongoDB
-
-
-# This part corresponds to "Task Running" in the document
-def task_running(task_pool, exp_name):
-    print("========== task_running ==========")
-    run_task(task_train, task_pool, experiment_name=exp_name)  # all tasks will be trained using "task_train" method
+def task_training(tasks, task_pool, exp_name):
+    trainer = TrainerRM()
+    trainer.train(tasks, exp_name, task_pool)
 
 
 # This part corresponds to "Task Collecting" in the document
 def task_collecting(task_pool, exp_name):
     print("========== task_collecting ==========")
 
-    def get_group_key_func(recorder):
+    def rec_key(recorder):
         task_config = recorder.load_object("task")
         model_key = task_config["model"]["class"]
         rolling_key = task_config["dataset"]["kwargs"]["segments"]["test"]
         return model_key, rolling_key
 
     def my_filter(recorder):
         # only choose the results of "LGBModel"
-        model_key, rolling_key = get_group_key_func(recorder)
+        model_key, rolling_key = rec_key(recorder)
         if model_key == "LGBModel":
             return True
         return False
 
-    collector = RecorderCollector(exp_name)
-    # group tasks by "get_task_key" and filter tasks by "my_filter"
-    artifact = collector.collect(RollingEnsemble(), get_group_key_func, rec_filter_func=my_filter)
+    artifact = ens_workflow(
+        RecorderCollector(exp_name=exp_name, rec_key_func=rec_key), RollingGroup(), rec_filter_func=my_filter
+    )
     print(artifact)
 
 
@@ -143,10 +137,9 @@ def main(
     }
     qlib.init(provider_uri=provider_uri, region=REG_CN, mongo=mongo_conf)
 
-    reset(task_pool, exp_name)
-    tasks = task_generating()
-    task_storing(tasks, task_pool, exp_name)
-    task_running(task_pool, exp_name)
+    # reset(task_pool, exp_name)
+    # tasks = task_generating()
+    # task_training(tasks, task_pool, exp_name)
     task_collecting(task_pool, exp_name)
 
 

diff --git a/...svr/task_manager_rolling_with_updating.py → ...srv/task_manager_rolling_with_updating.py b/...svr/task_manager_rolling_with_updating.py → ...srv/task_manager_rolling_with_updating.py
@@ -6,10 +6,10 @@
 from qlib.model.trainer import task_train
 from qlib.workflow import R
 from qlib.workflow.task.collect import RecorderCollector
-from qlib.workflow.task.ensemble import RollingEnsemble
+from qlib.model.ens.ensemble import RollingEnsemble
 from qlib.workflow.task.gen import RollingGen, task_generator
 from qlib.workflow.task.manage import TaskManager, run_task
-from qlib.workflow.task.online import RollingOnlineManager
+from qlib.workflow.online.manager import RollingOnlineManager
 from qlib.workflow.task.utils import list_recorders
 
 data_handler_config = {
@@ -155,10 +155,10 @@ def first_run():
     rolling_online_manager.reset_online_tag(latest_rec.values())
 
 
-def after_day():
+def routine():
     print("========== after_day ==========")
     print_online_model()
-    rolling_online_manager.after_day()
+    rolling_online_manager.routine()
     print_online_model()
     task_collecting()
 

diff --git a/examples/online_svr/update_online_pred.py → examples/online_srv/update_online_pred.py b/examples/online_svr/update_online_pred.py → examples/online_srv/update_online_pred.py
@@ -2,7 +2,7 @@
 import qlib
 from qlib.config import REG_CN
 from qlib.model.trainer import task_train
-from qlib.workflow.task.online import OnlineManagerR
+from qlib.workflow.online.manager import OnlineManagerR
 from qlib.workflow.task.utils import list_recorders
 
 data_handler_config = {
@@ -52,15 +52,15 @@
 }
 
 
-def first_train(experiment_name="online_svr"):
+def first_train(experiment_name="online_srv"):
 
     rid = task_train(task_config=task, experiment_name=experiment_name)
 
     online_manager = OnlineManagerR(experiment_name)
     online_manager.reset_online_tag(rid)
 
 
-def update_online_pred(experiment_name="online_svr"):
+def update_online_pred(experiment_name="online_srv"):
 
     online_manager = OnlineManagerR(experiment_name)
 

diff --git a/qlib/model/ens/ensemble.py b/qlib/model/ens/ensemble.py
@@ -0,0 +1,98 @@
+from abc import abstractmethod
+from typing import Callable, Union
+
+import pandas as pd
+from qlib.workflow.task.collect import Collector
+
+
+def ens_workflow(collector: Collector, process_list, artifacts_key=None, rec_filter_func=None, *args, **kwargs):
+    """the ensemble workflow based on collector and different dict processors.
+
+    Args:
+        collector (Collector): the collector to collect the result into {result_key: things}
+        process_list (list or Callable): the list of processors or the instance of processor to process dict.
+        The processor order is same as the list order.
+
+        For example: [Group1(..., Ensemble1()), Group2(..., Ensemble2())]
+
+        artifacts_key (list, optional): the artifacts key you want to get. If None, get all artifacts.
+        rec_filter_func (Callable, optional): filter the recorder by return True or False. Defaults to None.
+
+    Returns:
+        dict: the ensemble dict
+    """
+    collect_dict = collector.collect(artifacts_key=artifacts_key, rec_filter_func=rec_filter_func)
+    if not isinstance(process_list, list):
+        process_list = [process_list]
+
+    ensemble = {}
+    for artifact in collect_dict:
+        value = collect_dict[artifact]
+        for process in process_list:
+            if not callable(process):
+                raise NotImplementedError(f"{type(process)} is not supported in `ens_workflow`.")
+            value = process(value, *args, **kwargs)
+        ensemble[artifact] = value
+
+    return ensemble
+
+
+class Ensemble:
+    """Merge the objects in an Ensemble."""
+
+    def __init__(self, merge_func=None):
+        """init Ensemble
+
+        Args:
+            merge_func (Callable, optional): Given a dict, return the ensemble.
+
+                For example: {Rollinga_b: object, Rollingb_c: object} -> object
+
+            Defaults to None.
+        """
+        self._merge = merge_func
+
+    def __call__(self, ensemble_dict: dict, *args, **kwargs):
+        """Merge the ensemble_dict into an ensemble object.
+
+        Args:
+            ensemble_dict (dict): the ensemble dict waiting for merging like {name: things}
+
+        Returns:
+            object: the ensemble object
+        """
+        if isinstance(getattr(self, "_merge", None), Callable):
+            return self._merge(ensemble_dict, *args, **kwargs)
+        else:
+            raise NotImplementedError(f"Please specify valid merge_func.")
+
+
+class RollingEnsemble(Ensemble):
+
+    """Merge the rolling objects in an Ensemble"""
+
+    @staticmethod
+    def rolling_merge(rolling_dict: dict):
+        """Merge a dict of rolling dataframe like `prediction` or `IC` into an ensemble.
+
+        NOTE: The values of the dict must be pd.DataFrame objects whose index has a "datetime" level.
+
+        Args:
+            rolling_dict (dict): a dict like {"A": pd.DataFrame, "B": pd.DataFrame}.
+            The keys of the dict are ignored.
+
+        Returns:
+            pd.DataFrame: the complete result of rolling.
+        """
+        artifact_list = list(rolling_dict.values())
+        artifact_list.sort(key=lambda x: x.index.get_level_values("datetime").min())
+        artifact = pd.concat(artifact_list)
+        # If there are duplicated predictions, use the latest prediction
+        artifact = artifact[~artifact.index.duplicated(keep="last")]
+        artifact = artifact.sort_index()
+        return artifact
+
+    def __init__(self, merge_func=None):
+        super().__init__(merge_func=merge_func)
+        if merge_func is None:
+            self._merge = RollingEnsemble.rolling_merge
diff --git a/qlib/model/ens/group.py b/qlib/model/ens/group.py
@@ -0,0 +1,68 @@
+from qlib.model.ens.ensemble import Ensemble, RollingEnsemble
+from typing import Callable, Union
+
+
+class Group:
+    """Group the objects based on dict"""
+
+    def __init__(self, group_func=None, ens: Ensemble = None):
+        """init Group.
+
+        Args:
+            group_func (Callable, optional): Given a dict, return a dict mapping each group key to the elements of that group.
+
+                For example: {(A,B,C1): object, (A,B,C2): object} -> {(A,B): {C1: object, C2: object}}
+
+            Defaults to None.
+
+            ens (Ensemble, optional): If not None, do ensemble for grouped value after grouping.
+        """
+        self._group = group_func
+        self._ens = ens
+
+    def __call__(self, ungrouped_dict: dict, *args, **kwargs):
+        """Group the ungrouped_dict into different groups.
+
+        Args:
+            ungrouped_dict (dict): the ungrouped dict waiting for grouping like {name: things}
+
+        Returns:
+            dict: grouped_dict like {G1: object, G2: object}
+        """
+        if isinstance(getattr(self, "_group", None), Callable):
+            grouped_dict = self._group(ungrouped_dict, *args, **kwargs)
+            if self._ens is not None:
+                ens_dict = {}
+                for key, value in grouped_dict.items():
+                    ens_dict[key] = self._ens(value)
+                grouped_dict = ens_dict
+            return grouped_dict
+        else:
+            raise NotImplementedError(f"Please specify valid merge_func.")
+
+
+class RollingGroup(Group):
+    """group the rolling dict"""
+
+    @staticmethod
+    def rolling_group(rolling_dict: dict):
+        """Given a rolling dict like {(A,B,R): things}, return the grouped dict like {(A,B): {R:things}}
+
+        NOTE: This assumes that the rolling key is the last element of the key tuple, because the rolling results always need to be ensembled first.
+
+        Args:
+            rolling_dict (dict): a rolling dict. Entries whose key is not a tuple are skipped.
+
+        Returns:
+            dict: grouped dict
+        """
+        grouped_dict = {}
+        for key, values in rolling_dict.items():
+            if isinstance(key, tuple):
+                grouped_dict.setdefault(key[:-1], {})[key[-1]] = values
+        return grouped_dict
+
+    def __init__(self, group_func=None, ens: Ensemble = RollingEnsemble()):
+        super().__init__(group_func=group_func, ens=ens)
+        if group_func is None:
+            self._group = RollingGroup.rolling_group
diff --git a/qlib/model/trainer.py b/qlib/model/trainer.py
@@ -4,6 +4,7 @@
 from qlib.utils import init_instance_by_config, flatten_dict
 from qlib.workflow import R
 from qlib.workflow.record_temp import SignalRecord
+from qlib.workflow.task.manage import TaskManager, run_task
 
 
 def task_train(task_config: dict, experiment_name: str) -> str:
@@ -57,3 +58,70 @@ def task_train(task_config: dict, experiment_name: str) -> str:
                 ar.generate()
 
     return recorder
+
+
+class Trainer:
+    """
+    The trainer which can train a list of models
+    """
+
+    def train(self, *args, **kwargs):
+        """Given a list of model definitions, finish training and return their results.
+
+        Returns:
+            list: a list of trained results
+        """
+        raise NotImplementedError(f"Please implement the `train` method.")
+
+
+class TrainerR(Trainer):
+    """Trainer based on (R)ecorder.
+
+    Assumption: models are defined by `task` and the results will be saved to `Recorder`
+    """
+
+    def train(self, tasks: list, experiment_name: str, train_func=task_train, *args, **kwargs):
+        """Given a list of `task`s, train them and return a list of trained Recorders. The order can be guaranteed.
+
+        Args:
+            tasks (list): a list of definition based on `task` dict
+            experiment_name (str): the experiment name
+            train_func (Callable): the train method which needs at least `task` and `experiment_name`
+
+        Returns:
+            list: a list of Recorders
+        """
+        recs = []
+        for task in tasks:
+            recs.append(train_func(task, experiment_name, *args, **kwargs))
+        return recs
+
+
+class TrainerRM(TrainerR):
+    """Trainer based on (R)ecorder and Task(M)anager
+
+    Assumption: `task` will be saved to TaskManager and `task` will be fetched and trained from TaskManager
+    """
+
+    def train(self, tasks: list, experiment_name: str, task_pool: str, train_func=task_train, *args, **kwargs):
+        """Given a list of `task`s, train them and return a list of trained Recorders. The order can be guaranteed.
+
+        This method defaults to a single process, but TaskManager offers a great way to parallelize training.
+        Users can customize their train_func to realize multiple processes or even multiple machines.
+
+        Args:
+            tasks (list): a list of definition based on `task` dict
+            experiment_name (str): the experiment name
+            train_func (Callable): the train method which needs at least `task` and `experiment_name`
+
+        Returns:
+            list: a list of Recorders
+        """
+        tm = TaskManager(task_pool=task_pool)
+        _id_list = tm.create_task(tasks)  # all tasks will be saved to MongoDB
+        run_task(train_func, task_pool, experiment_name=experiment_name, *args, **kwargs)
+
+        recs = []
+        for _id in _id_list:
+            recs.append(tm.re_query(_id)["res"])
+        return recs
diff --git a/qlib/workflow/online/__init__.py b/qlib/workflow/online/__init__.py