Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit 67cc196

Browse files
authored
feat: pubsub reports dropped messages (coder#7660)
* Implementation; need Linux tests (Signed-off-by: Spike Curtis <[email protected]>)
* Pubsub with errors: tests and fixes (Signed-off-by: Spike Curtis <[email protected]>)
* Deal with test goroutines (Signed-off-by: Spike Curtis <[email protected]>)
1 parent 6a1e7ee commit 67cc196

File tree

6 files changed

+522
-52
lines changed

6 files changed

+522
-52
lines changed

coderd/database/postgres/postgres.go

+15-7
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,8 @@ import (
2222
// Super unlikely, but it happened. See: https://github.com/coder/coder/runs/5375197003
2323
var openPortMutex sync.Mutex
2424

25-
// Open creates a new PostgreSQL server using a Docker container.
25+
// Open creates a new PostgreSQL database instance. With DB_FROM environment variable set, it clones a database
26+
// from the provided template. With the environment variable unset, it creates a new Docker container running postgres.
2627
func Open() (string, func(), error) {
2728
if os.Getenv("DB_FROM") != "" {
2829
// In CI, creating a Docker container for each test is slow.
@@ -51,7 +52,12 @@ func Open() (string, func(), error) {
5152
// so cleaning up the container will clean up the database.
5253
}, nil
5354
}
55+
return OpenContainerized(0)
56+
}
5457

58+
// OpenContainerized creates a new PostgreSQL server using a Docker container. If port is nonzero, forward host traffic
59+
// to that port to the database. If port is zero, allocate a free port from the OS.
60+
func OpenContainerized(port int) (string, func(), error) {
5561
pool, err := dockertest.NewPool("")
5662
if err != nil {
5763
return "", nil, xerrors.Errorf("create pool: %w", err)
@@ -63,12 +69,14 @@ func Open() (string, func(), error) {
6369
}
6470

6571
openPortMutex.Lock()
66-
// Pick an explicit port on the host to connect to 5432.
67-
// This is necessary so we can configure the port to only use ipv4.
68-
port, err := getFreePort()
69-
if err != nil {
70-
openPortMutex.Unlock()
71-
return "", nil, xerrors.Errorf("get free port: %w", err)
72+
if port == 0 {
73+
// Pick an explicit port on the host to connect to 5432.
74+
// This is necessary so we can configure the port to only use ipv4.
75+
port, err = getFreePort()
76+
if err != nil {
77+
openPortMutex.Unlock()
78+
return "", nil, xerrors.Errorf("get free port: %w", err)
79+
}
7280
}
7381

7482
resource, err := pool.RunWithOptions(&dockertest.RunOptions{

coderd/database/pubsub.go

+170-35
Original file line numberDiff line numberDiff line change
@@ -15,29 +15,174 @@ import (
1515
// Listener represents a pubsub handler.
1616
type Listener func(ctx context.Context, message []byte)
1717

18+
// ListenerWithErr represents a pubsub handler that can also receive error
// indications, e.g. that messages to it were (or may have been) dropped.
type ListenerWithErr func(ctx context.Context, message []byte, err error)

// ErrDroppedMessages is sent to a ListenerWithErr if messages are dropped or
// might have been dropped (for example, after the underlying connection
// reconnects), so the subscriber can resynchronize its state.
var ErrDroppedMessages = xerrors.New("dropped messages")
25+
1826
// Pubsub is a generic interface for broadcasting and receiving messages.
// Implementors should assume high-availability with the backing implementation.
type Pubsub interface {
	// Subscribe registers listener for the named event. The returned cancel
	// function removes the subscription.
	Subscribe(event string, listener Listener) (cancel func(), err error)
	// SubscribeWithErr is like Subscribe, but the listener is additionally
	// notified (with a non-nil err, e.g. ErrDroppedMessages) when messages
	// were, or may have been, dropped.
	SubscribeWithErr(event string, listener ListenerWithErr) (cancel func(), err error)
	// Publish broadcasts message to all subscribers of event.
	Publish(event string, message []byte) error
	// Close releases resources held by the Pubsub.
	Close() error
}
2534

35+
// msgOrErr either contains a message or an error. A queue element with
// err == nil is a real message; otherwise it is an error indication such
// as ErrDroppedMessages.
type msgOrErr struct {
	msg []byte
	err error
}
40+
41+
// msgQueue implements a fixed length queue with the ability to replace elements
// after they are queued (but before they are dequeued).
//
// The purpose of this data structure is to build something that works a bit
// like a golang channel, but if the queue is full, then we can replace the
// last element with an error so that the subscriber can get notified that some
// messages were dropped, all without blocking.
type msgQueue struct {
	ctx    context.Context            // passed to listener callbacks
	cond   *sync.Cond                 // guards the fields below; signaled on enqueue/close
	q      [PubsubBufferSize]msgOrErr // ring-buffer backing store
	front  int                        // index of the oldest queued element
	size   int                        // number of queued elements (0..PubsubBufferSize)
	closed bool                       // set by close(); tells run() to exit
	l      Listener                   // exactly one of l / le is non-nil (enforced by newMsgQueue)
	le     ListenerWithErr
}
58+
59+
func newMsgQueue(ctx context.Context, l Listener, le ListenerWithErr) *msgQueue {
60+
if l == nil && le == nil {
61+
panic("l or le must be non-nil")
62+
}
63+
q := &msgQueue{
64+
ctx: ctx,
65+
cond: sync.NewCond(&sync.Mutex{}),
66+
l: l,
67+
le: le,
68+
}
69+
go q.run()
70+
return q
71+
}
72+
73+
// run is the consumer loop: it dequeues items one at a time and delivers
// them to the registered listener, exiting once close() sets q.closed.
// Listener callbacks are invoked WITHOUT holding the queue lock, so a slow
// listener cannot block producers calling enqueue/dropped.
func (q *msgQueue) run() {
	for {
		// wait until there is something on the queue or we are closed
		q.cond.L.Lock()
		for q.size == 0 && !q.closed {
			q.cond.Wait()
		}
		if q.closed {
			q.cond.L.Unlock()
			return
		}
		// Pop the oldest element from the ring buffer.
		item := q.q[q.front]
		q.front = (q.front + 1) % PubsubBufferSize
		q.size--
		q.cond.L.Unlock()

		// process item without holding lock
		if item.err == nil {
			// real message
			if q.l != nil {
				q.l(q.ctx, item.msg)
				continue
			}
			if q.le != nil {
				q.le(q.ctx, item.msg, nil)
				continue
			}
			// unhittable: newMsgQueue panics if both listeners are nil
			continue
		}
		// error indication: only deliver if the listener wants errors.
		if q.le != nil {
			q.le(q.ctx, nil, item.err)
		}
	}
}
109+
110+
func (q *msgQueue) enqueue(msg []byte) {
111+
q.cond.L.Lock()
112+
defer q.cond.L.Unlock()
113+
114+
if q.size == PubsubBufferSize {
115+
// queue is full, so we're going to drop the msg we got called with.
116+
// We also need to record that messages are being dropped, which we
117+
// do at the last message in the queue. This potentially makes us
118+
// lose 2 messages instead of one, but it's more important at this
119+
// point to warn the subscriber that they're losing messages so they
120+
// can do something about it.
121+
back := (q.front + PubsubBufferSize - 1) % PubsubBufferSize
122+
q.q[back].msg = nil
123+
q.q[back].err = ErrDroppedMessages
124+
return
125+
}
126+
// queue is not full, insert the message
127+
next := (q.front + q.size) % PubsubBufferSize
128+
q.q[next].msg = msg
129+
q.q[next].err = nil
130+
q.size++
131+
q.cond.Broadcast()
132+
}
133+
134+
func (q *msgQueue) close() {
135+
q.cond.L.Lock()
136+
defer q.cond.L.Unlock()
137+
defer q.cond.Broadcast()
138+
q.closed = true
139+
}
140+
141+
// dropped records an error in the queue that messages might have been dropped
142+
func (q *msgQueue) dropped() {
143+
q.cond.L.Lock()
144+
defer q.cond.L.Unlock()
145+
146+
if q.size == PubsubBufferSize {
147+
// queue is full, but we need to record that messages are being dropped,
148+
// which we do at the last message in the queue. This potentially drops
149+
// another message, but it's more important for the subscriber to know.
150+
back := (q.front + PubsubBufferSize - 1) % PubsubBufferSize
151+
q.q[back].msg = nil
152+
q.q[back].err = ErrDroppedMessages
153+
return
154+
}
155+
// queue is not full, insert the error
156+
next := (q.front + q.size) % PubsubBufferSize
157+
q.q[next].msg = nil
158+
q.q[next].err = ErrDroppedMessages
159+
q.size++
160+
q.cond.Broadcast()
161+
}
162+
26163
// pgPubsub is the Pubsub implementation using PostgreSQL LISTEN/NOTIFY.
type pgPubsub struct {
	ctx        context.Context
	pgListener *pq.Listener
	db         *sql.DB
	mut        sync.Mutex // guards queues
	// queues maps event name -> subscription id -> that subscriber's queue.
	queues map[string]map[uuid.UUID]*msgQueue
}
34171

35-
// PubsubBufferSize is the maximum number of unhandled messages we will buffer
// for a subscriber before dropping messages. Exported so tests and callers
// can reason about when drops begin.
const PubsubBufferSize = 2048
38175

39176
// Subscribe calls the listener when an event matching the name is received.
40177
func (p *pgPubsub) Subscribe(event string, listener Listener) (cancel func(), err error) {
178+
return p.subscribeQueue(event, newMsgQueue(p.ctx, listener, nil))
179+
}
180+
181+
func (p *pgPubsub) SubscribeWithErr(event string, listener ListenerWithErr) (cancel func(), err error) {
182+
return p.subscribeQueue(event, newMsgQueue(p.ctx, nil, listener))
183+
}
184+
185+
func (p *pgPubsub) subscribeQueue(event string, newQ *msgQueue) (cancel func(), err error) {
41186
p.mut.Lock()
42187
defer p.mut.Unlock()
43188

@@ -50,23 +195,20 @@ func (p *pgPubsub) Subscribe(event string, listener Listener) (cancel func(), er
50195
return nil, xerrors.Errorf("listen: %w", err)
51196
}
52197

53-
var eventListeners map[uuid.UUID]chan<- []byte
198+
var eventQs map[uuid.UUID]*msgQueue
54199
var ok bool
55-
if eventListeners, ok = p.listeners[event]; !ok {
56-
eventListeners = make(map[uuid.UUID]chan<- []byte)
57-
p.listeners[event] = eventListeners
200+
if eventQs, ok = p.queues[event]; !ok {
201+
eventQs = make(map[uuid.UUID]*msgQueue)
202+
p.queues[event] = eventQs
58203
}
59-
60-
ctx, cancelCallbacks := context.WithCancel(p.ctx)
61-
messages := make(chan []byte, messageBufferSize)
62-
go messagesToListener(ctx, messages, listener)
63204
id := uuid.New()
64-
eventListeners[id] = messages
205+
eventQs[id] = newQ
65206
return func() {
66207
p.mut.Lock()
67208
defer p.mut.Unlock()
68-
cancelCallbacks()
69-
listeners := p.listeners[event]
209+
listeners := p.queues[event]
210+
q := listeners[id]
211+
q.close()
70212
delete(listeners, id)
71213

72214
if len(listeners) == 0 {
@@ -109,6 +251,7 @@ func (p *pgPubsub) listen(ctx context.Context) {
109251
}
110252
// A nil notification can be dispatched on reconnect.
111253
if notif == nil {
254+
p.recordReconnect()
112255
continue
113256
}
114257
p.listenReceive(notif)
@@ -118,19 +261,22 @@ func (p *pgPubsub) listen(ctx context.Context) {
118261
// listenReceive fans a PostgreSQL NOTIFY out to every queue subscribed to
// the notification's channel. Enqueueing never blocks; a slow subscriber's
// queue records ErrDroppedMessages instead (see msgQueue.enqueue).
func (p *pgPubsub) listenReceive(notif *pq.Notification) {
	p.mut.Lock()
	defer p.mut.Unlock()
	queues, ok := p.queues[notif.Channel]
	if !ok {
		// No subscribers for this channel.
		return
	}
	extra := []byte(notif.Extra)
	for _, q := range queues {
		q.enqueue(extra)
	}
}
273+
274+
func (p *pgPubsub) recordReconnect() {
275+
p.mut.Lock()
276+
defer p.mut.Unlock()
277+
for _, listeners := range p.queues {
278+
for _, q := range listeners {
279+
q.dropped()
134280
}
135281
}
136282
}
@@ -162,20 +308,9 @@ func NewPubsub(ctx context.Context, database *sql.DB, connectURL string) (Pubsub
162308
ctx: ctx,
163309
db: database,
164310
pgListener: listener,
165-
listeners: make(map[string]map[uuid.UUID]chan<- []byte),
311+
queues: make(map[string]map[uuid.UUID]*msgQueue),
166312
}
167313
go pgPubsub.listen(ctx)
168314

169315
return pgPubsub, nil
170316
}
171-
172-
func messagesToListener(ctx context.Context, messages <-chan []byte, listener Listener) {
173-
for {
174-
select {
175-
case <-ctx.Done():
176-
return
177-
case m := <-messages:
178-
listener(ctx, m)
179-
}
180-
}
181-
}

0 commit comments

Comments
 (0)