Thanks to visit codestin.com
Credit goes to github.com

Skip to content

Stream API loses data when using an SSL socket #2414

Open
@Novelfor

Description

@Novelfor

What happened (please include outputs or screenshots):
I use the stream API to implement "kubectl exec". When I test my code, it sometimes loses data. After debugging, I think the cause is that the Kubernetes client mishandles non-blocking SSL sockets.

According to the Python documentation on SSL Sockets (https://docs.python.org/3/library/ssl.html#ssl-nonblocking), "SSL socket may still have data available for reading without select() being aware of it".

Current code:

def update(self, timeout=0):

This can be fixed by also checking the SSL socket's pending() function:

    def update(self, timeout=0):
        """Update channel buffers with at most one complete frame of input."""
        if not self.is_open():
            return
        if not self.sock.connected:
            self._connected = False
            return

        # The options here are:
        # select.select() - this will work on most OS, however, it has a
        #                   limitation of only able to read fd numbers up to 1024.
        #                   i.e. does not scale well. This was the original
        #                   implementation.
        # select.poll()   - this will work on most unix based OS, but not as
        #                   efficient as epoll. Will work for fd numbers above 1024.
        # select.epoll()  - newest and most efficient way of polling.
        #                   However, only works on linux.
        ssl_pending = 0
        if self.sock.is_ssl():
            ssl_pending = self.sock.sock.pending()

        if hasattr(select, "poll"):
            poll = select.poll()
            poll.register(self.sock.sock, select.POLLIN)
            if timeout is not None:
                timeout *= 1_000  # poll method uses milliseconds as the time unit
            r = poll.poll(timeout)
            poll.unregister(self.sock.sock)
        else:
            r, _, _ = select.select(
                (self.sock.sock, ), (), (), timeout)

        if r or ssl_pending > 0:

What you expected to happen:

How to reproduce it (as minimally and precisely as possible):
A simple test is to exec vim: the cursor is lost easily. This requires a simple TTY wrapper, so here is my code. Running vim or htop through it easily reproduces the data loss.

from kubernetes.stream import ws_client
import termios
import pty
import fcntl
import struct
import json
import signal
import tty
import select
import os
import threading
import yaml
import sys
import time
from datetime import datetime, timezone

def append_file(data):
    """Append ``data`` to ``test.txt`` as one line prefixed by a UTC timestamp."""
    stamp = datetime.now(tz=timezone.utc).strftime("%Y-%m-%d %H:%M:%S")
    entry = f"{stamp} {data}\n"
    with open("test.txt", "a") as log:
        log.write(entry)

class InteractiveShell:
    """Interactive terminal front-end for an exec stream held by a WSClient.

    Local stdin is forwarded to the remote process, the stream's stdout and
    stderr channels are copied to local file descriptors, and local terminal
    resizes are relayed on the resize channel.
    """

    def __init__(self, client: "ws_client.WSClient", has_stdin=True, has_tty=True, outfile=None, errfile=None):
        """Store the stream client and start the keep-alive thread.

        Args:
            client: Open exec stream used for all channel reads and writes.
            has_stdin: When false, local stdin is never forwarded.
            has_tty: When true, the local terminal is put into raw mode and
                window-size changes are relayed.
            outfile: File descriptor for remote stdout (defaults to stdout).
            errfile: File descriptor for remote stderr (defaults to stderr).
        """
        self.client = client
        self.has_stdin = has_stdin
        self.has_tty = has_tty
        self.master_fd = None
        # Saved terminal attributes; stays None when they could not be read,
        # so the restore paths can tell there is nothing to restore.
        self.old_settings = None
        self.errfile = pty.STDERR_FILENO if errfile is None else errfile
        self.outfile = pty.STDOUT_FILENO if outfile is None else outfile
        self.keep_ping = threading.Thread(target=self._keep_ping, daemon=True)
        self.keep_ping.start()

    def _keep_ping(self):
        """Write "ping" on channel 6 every ten minutes until a write fails."""
        # NOTE(review): channel 6 is not one of the ws_client channel constants
        # used elsewhere in this class; presumably an unused channel that only
        # serves as keep-alive traffic - confirm.
        while True:
            try:
                self.client.write_channel(6, "ping")
                time.sleep(60 * 10)
            except Exception:
                # Best effort: any failure (e.g. a closed stream) ends the thread.
                break

    def _set_pty_size(self, a=None, b=None):
        """
        Sets the window size of the child pty based on the window size of
        our own controlling terminal.

        The two optional arguments are ignored; they let this method be
        installed directly as a signal handler.
        """
        if not self.has_tty:
            return
        packed = fcntl.ioctl(pty.STDOUT_FILENO,
                             termios.TIOCGWINSZ,
                             struct.pack('HHHH', 0, 0, 0, 0))
        rows, cols, _, _ = struct.unpack('HHHH', packed)
        self.client.write_channel(ws_client.RESIZE_CHANNEL, json.dumps({"Height": rows, "Width": cols}))

    @staticmethod
    def _exit_code(status):
        """Translate a parsed error-channel status into an exit code.

        Returns 0 when ``status`` is None or reports "Success", the integer in
        ``details.causes[0].message`` otherwise, and -1 when the status does
        not have that shape.
        """
        if status is None:
            return 0
        try:
            if status['status'] == "Success":
                return 0
            return int(status['details']['causes'][0]['message'])
        except (KeyError, IndexError, TypeError, ValueError):
            return -1

    @staticmethod
    def _write_all(fd, data):
        """Write every byte of ``data`` to ``fd``, retrying after short writes."""
        remaining = memoryview(data)
        while remaining:
            written = os.write(fd, remaining)
            remaining = remaining[written:]

    def spawn(self, argv=None):
        """Run the interactive session and return the remote exit code.

        With a TTY, stdin is switched to raw mode and SIGWINCH is hooked for
        the duration of the session; both are restored before returning.
        ``argv`` is accepted for interface compatibility and is not used.
        """
        old_handler = None
        if self.has_tty:
            old_handler = signal.signal(signal.SIGWINCH, self._set_pty_size)
            try:
                self.old_settings = termios.tcgetattr(pty.STDIN_FILENO)
                tty.setraw(pty.STDIN_FILENO)
            except termios.error:
                # stdin is not a terminal we can reconfigure; carry on as-is.
                pass
        ret_data = None
        returncode = -1
        try:
            self._set_pty_size()
            ret_data = self.main_loop()
        finally:
            if self.has_tty:
                if self.old_settings is not None:
                    termios.tcsetattr(pty.STDIN_FILENO, termios.TCSADRAIN, self.old_settings)
                # signal.signal() returns None for handlers not installed from
                # Python, and None cannot be passed back in.
                if old_handler is not None:
                    signal.signal(signal.SIGWINCH, old_handler)
            if ret_data is None:
                err = self.client.read_channel(ws_client.ERROR_CHANNEL)
                ret_data = yaml.safe_load(err)
            returncode = self._exit_code(ret_data)
        return returncode

    def forward_stdin_thread(self):
        """Copy local stdin to the remote process until stdin reaches EOF."""
        if not self.has_stdin:
            # Nothing would ever be read, so polling stdin would only spin.
            return
        while True:
            rfds, _, _ = select.select([pty.STDIN_FILENO], [], [], 1.0)
            if not rfds:
                continue

            data = os.read(pty.STDIN_FILENO, 1024)
            append_file(f"STDIN: {data}")
            if not data:
                break
            if data == b"0":
                # Debugging hook: a lone "0" keystroke restores the terminal,
                # opens an IPython shell, then reads one frame directly from
                # the socket and logs it before the key is forwarded.
                if self.old_settings is not None:
                    termios.tcsetattr(sys.stdin, termios.TCSADRAIN, self.old_settings)
                from IPython import embed
                embed()
                op_code, rdata = self.client.sock.recv_data_frame(False)
                append_file(f"0 received, op_code: {op_code}, data: {rdata}")
            self.client.write_stdin(data)

    def main_loop(self):
        """Pump remote output to the local descriptors until the session ends.

        Returns the parsed error-channel message when one arrives, or None
        when the stream closes with nothing left buffered on any channel.
        """
        forward_thread = threading.Thread(target=self.forward_stdin_thread, daemon=True)
        forward_thread.start()
        while True:
            self.client.update(timeout=1.0)
            if self.client.peek_channel(ws_client.STDOUT_CHANNEL):
                data = self.client.read_channel(ws_client.STDOUT_CHANNEL)
                if data:
                    append_file(f"STDOUT: {data}\n")
                    self.write_stdout(data)
            elif self.client.peek_channel(ws_client.STDERR_CHANNEL):
                error_data = self.client.read_channel(ws_client.STDERR_CHANNEL)
                if error_data:
                    self.write_stderr(error_data)
            elif self.client.peek_channel(ws_client.RESIZE_CHANNEL):
                # Drain the channel; its content is not acted upon.
                self.client.read_channel(ws_client.RESIZE_CHANNEL)
            elif self.client.peek_channel(ws_client.ERROR_CHANNEL):
                error_data = self.client.read_channel(ws_client.ERROR_CHANNEL)
                error_msg = yaml.safe_load(error_data)
                return error_msg
            elif not self.client.is_open():
                # Checked last so buffered output is flushed first; without
                # this exit the loop would spin forever on a closed stream.
                return None

    def write_stdout(self, data):
        """Write remote stdout bytes to the configured output descriptor."""
        self._write_all(self.outfile, data)

    def write_stderr(self, data):
        """Write remote stderr bytes to the configured error descriptor."""
        self._write_all(self.errfile, data)

    def forward_stdin(self, data):
        """Send ``data`` to the remote process's stdin."""
        assert self.client is not None
        self.client.write_stdin(data)

if __name__ == "__main__":
    # Reproduction driver: open an interactive zsh in a pod through the
    # stream API and exit with the remote command's exit code.
    from kubernetes import client, config
    from kubernetes.stream import stream
    pod_name = "tlaunch-4fc21ba6-0"
    namespace = "ws-xingyuan"

    config.load_kube_config("/etc/kube.conf")
    core_api = client.CoreV1Api()

    # Named exec_stream so the imported kubernetes ``client`` module is not
    # shadowed by the stream object.
    exec_stream = stream(
        core_api.connect_get_namespaced_pod_exec,
        name=pod_name,
        namespace=namespace,
        command="zsh",
        stderr=True,
        stdin=True,
        stdout=True,
        tty=True,
        _preload_content=False,
        binary=True
    )

    shell = InteractiveShell(exec_stream, has_stdin=True, has_tty=True)
    return_code = shell.spawn()
    # Propagate the remote exit status instead of discarding it.
    sys.exit(return_code)

I also wrote a test.py, but the problem is harder to reproduce with it. Use exec to run the test code and enter any input; each input makes it send the current time to the host. Occasionally (after roughly 30–50 inputs, depending on the host's CPU speed) the data is delayed and is not received until the next input.

from datetime import datetime, timezone
import sys

if __name__ == "__main__":
    # For every line read from stdin, print the current UTC time (with
    # microseconds) followed by a fixed padding string; Ctrl-C exits with 254.
    stamp_format = "%Y-%m-%d %H:%M:%S.%f"
    try:
        while True:
            input()
            stamp = datetime.now(timezone.utc).strftime(stamp_format)
            print(stamp + "padding test", flush=True)
    except KeyboardInterrupt:
        sys.exit(254)

Anything else we need to know?:

Environment:

Metadata

Metadata

Assignees

No one assigned

    Labels

    kind/bug — Categorizes issue or PR as related to a bug.

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions