2 changes: 1 addition & 1 deletion README.md
@@ -141,7 +141,7 @@ Once the model is served and ready, you'll see the following output:

```
(venv) $ lab serve
INFO 2024-03-02 02:21:11,352 lab.py:201 Using model 'models/ggml-merlinite-7b-0302-Q4_K_M.gguf' with -1 gpu-layers
INFO 2024-03-02 02:21:11,352 lab.py:201 Using model 'models/ggml-merlinite-7b-0302-Q4_K_M.gguf' with -1 gpu-layers and 4096 max context size.
Starting server process
After application startup complete see http://127.0.0.1:8000/docs for API.
Press CTRL+C to shutdown server.
```
2 changes: 2 additions & 0 deletions cli/config.py
@@ -65,6 +65,7 @@ class _serve:
host_port: str
model_path: str
gpu_layers: int
max_ctx_size: int

def api_base(self):
"""Returns server API URL, based on the configured host and port"""
@@ -167,6 +168,7 @@ def get_default_config():
host_port=DEFAULT_HOST_PORT,
model_path=DEFAULT_MODEL_PATH,
gpu_layers=-1,
max_ctx_size=4096,
)
return Config(general=general, chat=chat, generate=generate, serve=serve)

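With the new dataclass field, a generated config.yaml gains a matching key under serve. A sketch of the resulting section, mirroring the fixture in tests/test_config.py further down (key order as the fixture writes it):

```yaml
serve:
  gpu_layers: -1
  max_ctx_size: 4096
  model_path: models/ggml-merlinite-7b-0302-Q4_K_M.gguf
  host_port: localhost:8000
```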
22 changes: 18 additions & 4 deletions cli/lab.py
@@ -248,16 +248,30 @@ def check(ctx, taxonomy_path, taxonomy_base):
help="The number of layers to put on the GPU. The rest will be on the CPU. Defaults to -1 to move all to GPU.",
)
@click.option("--num-threads", type=click.INT, help="The number of CPU threads to use")
@click.option(
"--max-ctx-size",
type=click.INT,
help="The context size is the maximum number of tokens considered by the model, for both the prompt and response. Defaults to 4096.",
)
@click.pass_context
def serve(ctx, model_path, gpu_layers, num_threads):
def serve(ctx, model_path, gpu_layers, num_threads, max_ctx_size):
"""Start a local server"""
ctx.obj.logger.info(f"Using model '{model_path}' with {gpu_layers} gpu-layers")
ctx.obj.logger.info(
f"Using model '{model_path}' with {gpu_layers} gpu-layers and {max_ctx_size} max context size."
)

try:
host = ctx.obj.config.serve.host_port.split(":")[0]
port = int(ctx.obj.config.serve.host_port.split(":")[1])
server(ctx.obj.logger, model_path, gpu_layers, num_threads, host, port)

server(
ctx.obj.logger,
model_path,
gpu_layers,
max_ctx_size,
num_threads,
host,
port,
)
except ServerException as exc:
click.secho(f"Error creating server: {exc}", fg="red")
raise click.exceptions.Exit(1)
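As a usage sketch (not part of the PR), the new option is passed like any other flag; the 8192 value is purely illustrative, and the log line follows the updated format string above:

```
(venv) $ lab serve --max-ctx-size 8192
INFO ... Using model 'models/ggml-merlinite-7b-0302-Q4_K_M.gguf' with -1 gpu-layers and 8192 max context size.
```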
13 changes: 11 additions & 2 deletions cli/server.py
@@ -42,6 +42,7 @@ def ensure_server(logger, serve_config):
"logger": logger,
"model_path": serve_config.model_path,
"gpu_layers": serve_config.gpu_layers,
"max_ctx_size": serve_config.max_ctx_size,
"port": port,
},
daemon=True,
@@ -50,13 +51,21 @@ def ensure_server(logger, serve_config):
return (server_process, temp_api_base)


def server(logger, model_path, gpu_layers, threads=None, host="localhost", port=8000):
def server(
logger,
model_path,
gpu_layers,
max_ctx_size,
threads=None,
host="localhost",
port=8000,
):
"""Start OpenAI-compatible server"""
settings = Settings(
host=host,
port=port,
model=model_path,
n_ctx=4096,
n_ctx=max_ctx_size,
n_gpu_layers=gpu_layers,
verbose=logger.level == logging.DEBUG,
)
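A minimal sketch of calling the updated function directly; the cli.server import path and the model location are assumptions, and in practice `lab serve` assembles these arguments from config.yaml and the CLI flags:

```python
import logging

from cli.server import server  # assumed import path; the function lives in cli/server.py

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("lab")

# Start the OpenAI-compatible server on localhost:8000 with a 4096-token
# context window (n_ctx); this call blocks until the server is stopped.
server(
    logger,
    model_path="models/ggml-merlinite-7b-0302-Q4_K_M.gguf",
    gpu_layers=-1,
    max_ctx_size=4096,
)
```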
111 changes: 79 additions & 32 deletions scripts/functional-tests.sh
@@ -13,18 +13,20 @@ done

PID1=
PID2=
PID_SERVE=
PID_CHAT=

cleanup() {
set +e
if [ -n "$PID1" ]; then
kill $PID1
fi
if [ -n "$PID2" ]; then
kill $PID2
fi
for pid in $PID1 $PID2 $PID_SERVE $PID_CHAT; do
if [ -n "$pid" ]; then
kill $pid
fi
done
rm -f test_ctx_size_lab_serve_output.txt
}

trap cleanup 0
trap cleanup EXIT QUIT INT TERM

rm -f config.yaml

@@ -35,37 +37,82 @@ echo -e "\n\n\n" | lab init
lab download

# check that lab serve is working
expect -c '
spawn lab serve
expect {
"http://localhost:8000/docs" { exit 0 }
eof
test_bind_port(){
expect -c '
spawn lab serve
expect {
"http://localhost:8000/docs" { exit 0 }
eof
}

python -m http.server 8000 &
PID1=$!

# check that lab serve is detecting the port is already in use
# catch "error while attempting to bind on address ...."
spawn lab serve
expect {
"error while attempting to bind on address " { exit 1 }
eof
}

# configure a different port
sed -i 's/8000/9999/g' config.yaml

# check that lab serve is working on the new port
# catch ERROR strings in the output
spawn lab serve
expect {
"http://localhost:9999/docs" { exit 0}
eof
}
'
python -m http.server 9999 &
PID2=$!
}

python -m http.server 8000 &
PID1=$!
test_ctx_size(){
lab serve --max-ctx-size 1 > test_ctx_size_lab_serve_output.txt 2>&1 &
PID_SERVE=$!

# check that lab serve is detecting the port is already in use
# catch "error while attempting to bind on address ...."
spawn lab serve
expect {
"error while attempting to bind on address " { exit 1 }
eof
# the error is expected so let's ignore it to not fall into the trap
set +e
# now chat with the server
lab chat <<EOF &
hello
EOF
PID_CHAT=$!
# re-activate the error trap
set -e

# wait a bit for the pid directory to disappear
for i in $(seq 1 20); do
if ! test -d /proc/$PID_CHAT; then
break
fi
# error out if the process is still running after 20 seconds
if [ $i -eq 20 ]; then
echo "chat process is still running"
exit 1
fi
sleep 1
done
# reset the PID_CHAT variable so that the cleanup function doesn't try to kill it
PID_CHAT=

# look for the context size error in the server logs
if ! grep -q "exceed context window of" test_ctx_size_lab_serve_output.txt; then
echo "context size error not found"
exit 1
fi
}

# configure a different port
sed -i 's/8000/9999/g' config.yaml

# check that lab serve is working on the new port
# catch ERROR strings in the output
spawn lab serve
expect {
"http://localhost:9999/docs" { exit 0}
eof
}
'

python -m http.server 9999 &
PID2=$!
########
# MAIN #
########
test_bind_port
test_ctx_size

exit 0
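To exercise the new check locally, running the script from the repository root should suffice, assuming `lab` and `expect` are installed and a model can be fetched (the script runs `lab init` and `lab download` itself):

```bash
# from the repository root; requires lab, expect, and network access for `lab download`
./scripts/functional-tests.sh
```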
1 change: 1 addition & 0 deletions tests/schema.py
@@ -37,6 +37,7 @@ class Generate(BaseModel):

class Serve(BaseModel):
gpu_layers: int
max_ctx_size: int
model_path: str


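For illustration, a trimmed sketch of what this schema change enforces, assuming BaseModel here is pydantic's; the real Serve model may carry additional fields (e.g. host_port) outside the hunk shown above:

```python
from pydantic import BaseModel, ValidationError


class Serve(BaseModel):
    gpu_layers: int
    max_ctx_size: int
    model_path: str


# A serve config without max_ctx_size (or with a non-integer value) now fails validation.
try:
    Serve(gpu_layers=-1, model_path="models/ggml-merlinite-7b-0302-Q4_K_M.gguf")
except ValidationError as err:
    print(err)  # reports the missing max_ctx_size field
```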
3 changes: 3 additions & 0 deletions tests/test_config.py
@@ -26,6 +26,7 @@
output_dir: /tmp
serve:
gpu_layers: -1
max_ctx_size: 4096
model_path: models/ggml-merlinite-7b-0302-Q4_K_M.gguf
host_port: localhost:8000
"""
@@ -54,6 +55,7 @@
output_dir: /tmp
serve:
gpu_layers: -1
max_ctx_size: 4096
model_path: models/ggml-merlinite-7b-0302-Q4_K_M.gguf
"""

@@ -74,6 +76,7 @@ def test_config(self):
assert cfg is not None
assert cfg.serve is not None
assert cfg.serve.gpu_layers == -1
assert cfg.serve.max_ctx_size == 4096
assert cfg.serve.model_path == "models/ggml-merlinite-7b-0302-Q4_K_M.gguf"
assert cfg.chat.context == "default"
assert cfg.chat.model == "ggml-merlinite-7b-0302-Q4_K_M"