2 changes: 1 addition & 1 deletion README.md
@@ -141,7 +141,7 @@ Once the model is served and ready, you'll see the following output:

```
(venv) $ lab serve
INFO 2024-03-02 02:21:11,352 lab.py:201 Using model 'models/ggml-merlinite-7b-0302-Q4_K_M.gguf' with -1 gpu-layers
INFO 2024-03-02 02:21:11,352 lab.py:201 Using model 'models/ggml-merlinite-7b-0302-Q4_K_M.gguf' with -1 gpu-layers and 4096 max context size.
Starting server process
After application startup complete see http://127.0.0.1:8000/docs for API.
Press CTRL+C to shutdown server.
```
2 changes: 2 additions & 0 deletions cli/config.py
@@ -65,6 +65,7 @@ class _serve:
host_port: str
model_path: str
gpu_layers: int
max_ctx_size: int

def api_base(self):
"""Returns server API URL, based on the configured host and port"""
@@ -167,6 +168,7 @@ def get_default_config():
host_port=DEFAULT_HOST_PORT,
model_path=DEFAULT_MODEL_PATH,
gpu_layers=-1,
max_ctx_size=4096,
)
return Config(general=general, chat=chat, generate=generate, serve=serve)

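With the new dataclass field, a generated config.yaml gains a matching key under serve. A sketch of the resulting section, mirroring the fixture in tests/test_config.py further down (key order as the fixture writes it):

```yaml
serve:
  gpu_layers: -1
  max_ctx_size: 4096
  model_path: models/ggml-merlinite-7b-0302-Q4_K_M.gguf
  host_port: localhost:8000
```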
22 changes: 18 additions & 4 deletions cli/lab.py
@@ -248,16 +248,30 @@ def check(ctx, taxonomy_path, taxonomy_base):
help="The number of layers to put on the GPU. The rest will be on the CPU. Defaults to -1 to move all to GPU.",
)
@click.option("--num-threads", type=click.INT, help="The number of CPU threads to use")
@click.option(
"--max-ctx-size",
type=click.INT,
help="The context size is the maximum number of tokens considered by the model, for both the prompt and response. Defaults to 4096.",
)
@click.pass_context
def serve(ctx, model_path, gpu_layers, num_threads):
def serve(ctx, model_path, gpu_layers, num_threads, max_ctx_size):
"""Start a local server"""
ctx.obj.logger.info(f"Using model '{model_path}' with {gpu_layers} gpu-layers")
ctx.obj.logger.info(
f"Using model '{model_path}' with {gpu_layers} gpu-layers and {max_ctx_size} max context size."
)

try:
host = ctx.obj.config.serve.host_port.split(":")[0]
port = int(ctx.obj.config.serve.host_port.split(":")[1])
server(ctx.obj.logger, model_path, gpu_layers, num_threads, host, port)

server(
ctx.obj.logger,
model_path,
gpu_layers,
max_ctx_size,
num_threads,
host,
port,
)
except ServerException as exc:
click.secho(f"Error creating server: {exc}", fg="red")
raise click.exceptions.Exit(1)
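As a usage sketch (not part of the PR), the new option is passed like any other flag; the 8192 value is purely illustrative, and the log line follows the updated format string above:

```
(venv) $ lab serve --max-ctx-size 8192
INFO ... Using model 'models/ggml-merlinite-7b-0302-Q4_K_M.gguf' with -1 gpu-layers and 8192 max context size.
```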
13 changes: 11 additions & 2 deletions cli/server.py
@@ -42,6 +42,7 @@ def ensure_server(logger, serve_config):
"logger": logger,
"model_path": serve_config.model_path,
"gpu_layers": serve_config.gpu_layers,
"max_ctx_size": serve_config.max_ctx_size,
"port": port,
},
daemon=True,
@@ -50,13 +51,21 @@ def ensure_server(logger, serve_config):
return (server_process, temp_api_base)


def server(logger, model_path, gpu_layers, threads=None, host="localhost", port=8000):
def server(
logger,
model_path,
gpu_layers,
max_ctx_size,
threads=None,
host="localhost",
port=8000,
):
"""Start OpenAI-compatible server"""
settings = Settings(
host=host,
port=port,
model=model_path,
n_ctx=4096,
n_ctx=max_ctx_size,
n_gpu_layers=gpu_layers,
verbose=logger.level == logging.DEBUG,
)
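A minimal sketch of calling the updated function directly; the cli.server import path and the model location are assumptions, and in practice `lab serve` assembles these arguments from config.yaml and the CLI flags:

```python
import logging

from cli.server import server  # assumed import path; the function lives in cli/server.py

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("lab")

# Start the OpenAI-compatible server on localhost:8000 with a 4096-token
# context window (n_ctx); this call blocks until the server is stopped.
server(
    logger,
    model_path="models/ggml-merlinite-7b-0302-Q4_K_M.gguf",
    gpu_layers=-1,
    max_ctx_size=4096,
)
```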
111 changes: 79 additions & 32 deletions scripts/functional-tests.sh
@@ -13,18 +13,20 @@ done

PID1=
PID2=
PID_SERVE=
PID_CHAT=

cleanup() {
set +e
if [ -n "$PID1" ]; then
kill $PID1
fi
if [ -n "$PID2" ]; then
kill $PID2
fi
for pid in $PID1 $PID2 $PID_SERVE $PID_CHAT; do
if [ -n "$pid" ]; then
kill $pid
fi
done
rm -f test_ctx_size_lab_serve_output.txt
}

trap cleanup 0
trap cleanup EXIT QUIT INT TERM

rm -f config.yaml

@@ -35,37 +37,82 @@ echo -e "\n\n\n" | lab init
lab download

# check that lab serve is working
expect -c '
spawn lab serve
expect {
"http://localhost:8000/docs" { exit 0 }
eof
test_bind_port(){
expect -c '
spawn lab serve
expect {
"http://localhost:8000/docs" { exit 0 }
eof
}

python -m http.server 8000 &
PID1=$!

# check that lab serve is detecting the port is already in use
# catch "error while attempting to bind on address ...."
spawn lab serve
expect {
"error while attempting to bind on address " { exit 1 }
eof
}

# configure a different port
sed -i 's/8000/9999/g' config.yaml

# check that lab serve is working on the new port
# catch ERROR strings in the output
spawn lab serve
expect {
"http://localhost:9999/docs" { exit 0}
eof
}
'
python -m http.server 9999 &
PID2=$!
}

python -m http.server 8000 &
PID1=$!
test_ctx_size(){
lab serve --max-ctx-size 1 > test_ctx_size_lab_serve_output.txt 2>&1 &
PID_SERVE=$!

# check that lab serve is detecting the port is already in use
# catch "error while attempting to bind on address ...."
spawn lab serve
expect {
"error while attempting to bind on address " { exit 1 }
eof
# the error is expected so let's ignore it to not fall into the trap
set +e
# now chat with the server
lab chat <<EOF &
hello
EOF
PID_CHAT=$!
# re-activate the error trap
set -e

# wait a bit for the pid directory to disappear
for i in $(seq 1 20); do
if ! test -d /proc/$PID_CHAT; then
break
fi
# error out if the process is still running after 20 seconds
if [ $i -eq 20 ]; then
echo "chat process is still running"
exit 1
fi
sleep 1
done
# reset the PID_CHAT variable so that the cleanup function doesn't try to kill it
PID_CHAT=

# look for the context size error in the server logs
if ! grep -q "exceed context window of" test_ctx_size_lab_serve_output.txt; then
echo "context size error not found"
exit 1
fi
}

# configure a different port
sed -i 's/8000/9999/g' config.yaml

# check that lab serve is working on the new port
# catch ERROR strings in the output
spawn lab serve
expect {
"http://localhost:9999/docs" { exit 0}
eof
}
'

python -m http.server 9999 &
PID2=$!
########
# MAIN #
########
test_bind_port
test_ctx_size

exit 0
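To exercise the new check locally, running the script from the repository root should suffice, assuming `lab` and `expect` are installed and a model can be fetched (the script runs `lab init` and `lab download` itself):

```bash
# from the repository root; requires lab, expect, and network access for `lab download`
./scripts/functional-tests.sh
```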
1 change: 1 addition & 0 deletions tests/schema.py
@@ -37,6 +37,7 @@ class Generate(BaseModel):

class Serve(BaseModel):
gpu_layers: int
max_ctx_size: int
model_path: str


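For illustration, a trimmed sketch of what this schema change enforces, assuming BaseModel here is pydantic's; the real Serve model may carry additional fields (e.g. host_port) outside the hunk shown above:

```python
from pydantic import BaseModel, ValidationError


class Serve(BaseModel):
    gpu_layers: int
    max_ctx_size: int
    model_path: str


# A serve config without max_ctx_size (or with a non-integer value) now fails validation.
try:
    Serve(gpu_layers=-1, model_path="models/ggml-merlinite-7b-0302-Q4_K_M.gguf")
except ValidationError as err:
    print(err)  # reports the missing max_ctx_size field
```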
3 changes: 3 additions & 0 deletions tests/test_config.py
@@ -26,6 +26,7 @@
output_dir: /tmp
serve:
gpu_layers: -1
max_ctx_size: 4096
model_path: models/ggml-merlinite-7b-0302-Q4_K_M.gguf
host_port: localhost:8000
"""
@@ -54,6 +55,7 @@
output_dir: /tmp
serve:
gpu_layers: -1
max_ctx_size: 4096
model_path: models/ggml-merlinite-7b-0302-Q4_K_M.gguf
"""

@@ -74,6 +76,7 @@ def test_config(self):
assert cfg is not None
assert cfg.serve is not None
assert cfg.serve.gpu_layers == -1
assert cfg.serve.max_ctx_size == 4096
assert cfg.serve.model_path == "models/ggml-merlinite-7b-0302-Q4_K_M.gguf"
assert cfg.chat.context == "default"
assert cfg.chat.model == "ggml-merlinite-7b-0302-Q4_K_M"