Improve docs & lint

claudiosv · claudiosv · commit e4b5ef200889 · 2025-07-02T15:40:59.000+02:00
Signed-off-by: Claudio Spiess <claudiosv@users.noreply.github.com>
diff --git a/docs/README.md b/docs/README.md
@@ -23,7 +23,7 @@ PDL provides the following features:
 
 The PDL interpreter takes a PDL program as input and generates data by executing its instructions (calling out to models, code, etc...).
 
-See below for a quick reference, followed by [installation notes](#interpreter_installation) and an [overview](#overview) of the language. A more detailed description of the language features can be found in this [tutorial](https://ibm.github.io/prompt-declaration-language/tutorial).
+See below for a quick reference, followed by [installation notes](#interpreter-installation) and an [overview](#overview) of the language. A more detailed description of the language features can be found in this [tutorial](https://ibm.github.io/prompt-declaration-language/tutorial).
 
 
 ## Quick Reference
@@ -50,13 +50,13 @@ pip install 'prompt-declaration-language[examples]'
 
 The Live Explorer can be installed as follows (MacOS):
 ```
-brew install pdl 
+brew install pdl
 ```
 
 For other platforms, see installation notes.
 
 You can run PDL with LLM models in local using [Ollama](https://ollama.com), or other cloud service.
-See [here](https://ibm.github.io/prompt-declaration-language/tutorial/#using-ollama-models) for 
+See [here](https://ibm.github.io/prompt-declaration-language/tutorial/#using-ollama-models) for
 instructions on how to install an Ollama model locally.
 
 Most examples in this repository use IBM Granite models on [Ollama](https://ollama.com) and some are on [Replicate](https://replicate.com/). In order to run these examples, you need to create a free account
@@ -172,7 +172,7 @@ text:
     temperature: 0
 ```
 
-Notice the syntactic differences. Model ids on watsonx start with `watsonx`. 
+Notice the syntactic differences. Model ids on watsonx start with `watsonx`.
 
 Watsonx also provides a text completion endpoint as shown in the following example. A text completion endpoint does not take chat
 templates into account:
@@ -299,10 +299,10 @@ When we execute this program with the PDL interpreter, we obtain the following t
 @SuppressWarnings("unchecked")
 public static Map<String, String> deserializeOffsetMap(String lastSourceOffset) throws IOException {
   Map<String, String> offsetMap;
-  if (lastSourceOffset == null || lastSourceOffset.isEmpty()) {    
-    offsetMap = new HashMap<>();  
+  if (lastSourceOffset == null || lastSourceOffset.isEmpty()) {
+    offsetMap = new HashMap<>();
   } else {
-    offsetMap = JSON_MAPPER.readValue(lastSourceOffset, Map.class);  
+    offsetMap = JSON_MAPPER.readValue(lastSourceOffset, Map.class);
   }
   return offsetMap;
 }
@@ -364,10 +364,10 @@ When we execute this new program, we obtain the following:
 @SuppressWarnings("unchecked")
 public static Map<String, String> deserializeOffsetMap(String lastSourceOffset) throws IOException {
   Map<String, String> offsetMap;
-  if (lastSourceOffset == null || lastSourceOffset.isEmpty()) {    
-    offsetMap = new HashMap<>();  
+  if (lastSourceOffset == null || lastSourceOffset.isEmpty()) {
+    offsetMap = new HashMap<>();
   } else {
-    offsetMap = JSON_MAPPER.readValue(lastSourceOffset, Map.class);  
+    offsetMap = JSON_MAPPER.readValue(lastSourceOffset, Map.class);
   }
   return offsetMap;
 }
diff --git a/docs/autopdl.md b/docs/autopdl.md
@@ -173,4 +173,4 @@ This will report details about the optimization process, such as the number of c
    0% ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 0/1,200  [ 0:00:01 < -:--:-- , ? it/s ]
 ```
 
-Once the process is complete, a file `optimized_gsm8k.pdl` is written in same directory as the source PDL file. This file contains the optimal configuration and is directly executable by the standard PDL interpreter. A log of the optimization process is written to `experiments/` by default.
+Once the process is complete, a file `optimized_gsm8k.pdl` is written in the same directory as the source PDL file. This file contains the optimal configuration and is directly executable by the standard PDL interpreter. A log of the optimization process is written to `experiments/` by default.
diff --git a/examples/optimizer/optimize.py b/examples/optimizer/optimize.py
@@ -5,7 +5,7 @@
 from typing import Any
 
 import yaml
-from datasets import load_from_disk
+from datasets.load import load_from_disk
 from fever_evaluator import FEVEREvaluator
 from gsm8k_evaluator import Gsm8kEvaluator
 from gsmhard_evaluator import GsmHardEvaluator
diff --git a/src/pdl/optimize/pdl_optimizer.py b/src/pdl/optimize/pdl_optimizer.py
@@ -160,10 +160,13 @@ def sample_candidates(
         demo_name = self.config.demonstrations_variable_name
         candidates = []
 
+        num_demonstrations_set = {
+            int(x) for x in self.config.variables.get("num_demonstrations", set())
+        }
+
         if (
-            "prompt_pattern" in self.config.variables
-            and "cot" in self.config.variables.get("prompt_pattern", [])
-            and 0 in self.config.variables.get("num_demonstrations", [])
+            "cot" in self.config.variables.get("prompt_pattern", [])
+            and 0 in num_demonstrations_set
         ):
             cot_candidate = {
                 k: self.sample_random_index(v) for k, v in self.config.variables.items()
@@ -179,18 +182,18 @@ def sample_candidates(
 
             candidates.append(cot_candidate)
 
-        zero_shots_seen = ["cot"]
+        zero_shots_seen = {"cot"}
         while len(candidates) < num_candidates:
             variable_instance = {
                 k: self.sample_random_index(v) for k, v in self.config.variables.items()
             }
             if (
                 variable_instance.get("num_demonstrations") == 0
-                and variable_instance.get("prompt_pattern") == "cot"
+                and variable_instance.get("prompt_pattern") is not None
             ):
                 if variable_instance["prompt_pattern"] in zero_shots_seen:
                     continue
-                zero_shots_seen.append(variable_instance["prompt_pattern"])
+                zero_shots_seen.add(variable_instance["prompt_pattern"])
 
             num_demonstrations = int(
                 variable_instance.get("num_demonstrations", self.num_demonstrations),
@@ -215,16 +218,26 @@ def sample_candidates(
             candidates.append(candidate)
 
         if (
-            "num_demonstrations"
-            in self.config.variables  # check if is variable in config
-            and len(self.config.variables["num_demonstrations"])
-            > 1  # check more than 1 option
-            and 0 in [int(x) for x in self.config.variables["num_demonstrations"]]
-            # check zeroshot is an option
+            len(num_demonstrations_set) > 1  # check more than 1 option
+            and 0 in num_demonstrations_set  # check zeroshot is an option
         ):
-            zero_shotters = [x for x in candidates if x["num_demonstrations"] == 0]
+            zero_shotters = [
+                x.get("uuid") for x in candidates if x.get("num_demonstrations") == 0
+            ]
+            variables_zs = self.config.variables.copy()
+            variables_zs.pop("num_demonstrations", None)
+
+            max_zs = len(list(itertools.product(*variables_zs.values())))
+
+            if len(zero_shotters) > max_zs:
+                logger.warning(
+                    "More zero-shot candidates (%d) than expected (%d; "
+                    "product of all variables). "
+                    "Identical duplicated candidates may waste compute.",
+                    len(zero_shotters),
+                    max_zs,
+                )
 
-            assert len(zero_shotters) <= 3
         assert len(candidates) == num_candidates
         return candidates