sliding-sync/sync2/poller_test.go

package sync2
import (
"context"
"encoding/json"
"fmt"
"net/http"
"os"
"strconv"
"sync"
"testing"
"time"
"github.com/matrix-org/sliding-sync/internal"
"github.com/matrix-org/sliding-sync/testutils"
"github.com/rs/zerolog"
)
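// initialSinceToken is the since token tests hand to Poll for their first request. It is
// numeric so that mock sync responses can advance it with strconv.Atoi.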
const initialSinceToken = "0"
// Monkey patch out timeSince and timeSleep with test-controlled values.
// This is done in the init block so we can be sure to swap them out BEFORE any pollers
// start. If we wait until pollers exist, we get data races. This includes pollers in tests
// which don't use timeSince, hence the init block.
var (
timeSinceMu sync.Mutex
timeSinceValue = time.Duration(0) // 0 means use the real impl
timeSleepMu sync.Mutex
timeSleepValue = time.Duration(0) // 0 means use the real impl
timeSleepCheck func(time.Duration) // called to check sleep values
)
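// setTimeSinceValue makes the patched timeSince report a fixed duration; pass 0 to fall
// back to the real time.Since. For example, a test that wants the poller to believe more
// than a minute has passed since it last stored a since token (without actually waiting)
// can do:
//
//	setTimeSinceValue(time.Minute * 2)
//	defer setTimeSinceValue(0) // restore the real time.Since
//
// as TestPollerPollUpdateDeviceSincePeriodically does below.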
func setTimeSinceValue(val time.Duration) {
timeSinceMu.Lock()
defer timeSinceMu.Unlock()
timeSinceValue = val
}
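// setTimeSleepDelay makes the patched timeSleep sleep for a fixed duration instead of the
// one requested; pass 0 to sleep for the requested duration. The optional fn is invoked
// with the requested duration, which lets tests assert on backoff values.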
func setTimeSleepDelay(val time.Duration, fn ...func(d time.Duration)) {
timeSleepMu.Lock()
defer timeSleepMu.Unlock()
timeSleepValue = val
if len(fn) > 0 {
timeSleepCheck = fn[0]
}
}
func init() {
timeSince = func(t time.Time) time.Duration {
timeSinceMu.Lock()
defer timeSinceMu.Unlock()
if timeSinceValue == 0 {
return time.Since(t)
}
return timeSinceValue
}
timeSleep = func(d time.Duration) {
timeSleepMu.Lock()
defer timeSleepMu.Unlock()
if timeSleepCheck != nil {
timeSleepCheck(d)
}
if timeSleepValue == 0 {
time.Sleep(d)
return
}
time.Sleep(timeSleepValue)
}
}
// Tests that EnsurePolling works in the happy case
func TestPollerMapEnsurePolling(t *testing.T) {
nextSince := "next"
roomID := "!foo:bar"
roomState := []json.RawMessage{
json.RawMessage(`{"event":1}`),
json.RawMessage(`{"event":2}`),
json.RawMessage(`{"event":3}`),
}
initialResponse := &SyncResponse{
NextBatch: nextSince,
Rooms: struct {
Join map[string]SyncV2JoinResponse `json:"join"`
Invite map[string]SyncV2InviteResponse `json:"invite"`
Leave map[string]SyncV2LeaveResponse `json:"leave"`
}{
Join: map[string]SyncV2JoinResponse{
roomID: {
State: EventsResponse{
Events: roomState,
},
},
},
},
}
syncRequests := make(chan string)
syncResponses := make(chan *SyncResponse)
accumulator, client := newMocks(func(authHeader, since string) (*SyncResponse, int, error) {
syncRequests <- since
return <-syncResponses, 200, nil
})
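// incomingProcess/unblockProcess let the test pause the poller inside
// mockDataReceiver.Initialise, so we can assert that EnsurePolling is still blocked while
// the initial state response is being processed.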
accumulator.incomingProcess = make(chan struct{})
accumulator.unblockProcess = make(chan struct{})
pm := NewPollerMap(client, false)
pm.SetCallbacks(accumulator)
ensurePollingUnblocked := make(chan struct{})
go func() {
pm.EnsurePolling(PollerID{
UserID: "@alice:localhost",
DeviceID: "FOOBAR",
}, "access_token", "", false, zerolog.New(os.Stderr))
close(ensurePollingUnblocked)
}()
ensureBlocking := func() {
select {
case <-ensurePollingUnblocked:
t.Fatalf("EnsurePolling unblocked")
default:
}
}
// wait until we get a /sync request
since := <-syncRequests
if since != "" {
t.Fatalf("/sync not made with empty since token, got %v", since)
}
// make sure we're still blocking
ensureBlocking()
// respond to the /sync request
syncResponses <- initialResponse
// make sure we're still blocking
ensureBlocking()
// wait until we are processing the state response
<-accumulator.incomingProcess
// make sure we're still blocking
ensureBlocking()
// finish processing
accumulator.unblockProcess <- struct{}{}
// make sure we unblock
select {
case <-ensurePollingUnblocked:
case <-time.After(time.Second):
t.Fatalf("EnsurePolling did not unblock after 1s")
}
}
func TestPollerMapEnsurePollingIdempotent(t *testing.T) {
nextSince := "next"
roomID := "!foo:bar"
roomState := []json.RawMessage{
json.RawMessage(`{"event":1}`),
json.RawMessage(`{"event":2}`),
json.RawMessage(`{"event":3}`),
}
initialResponse := &SyncResponse{
NextBatch: nextSince,
Rooms: struct {
Join map[string]SyncV2JoinResponse `json:"join"`
Invite map[string]SyncV2InviteResponse `json:"invite"`
Leave map[string]SyncV2LeaveResponse `json:"leave"`
}{
Join: map[string]SyncV2JoinResponse{
roomID: {
State: EventsResponse{
Events: roomState,
},
},
},
},
}
syncRequests := make(chan string)
syncResponses := make(chan *SyncResponse)
accumulator, client := newMocks(func(authHeader, since string) (*SyncResponse, int, error) {
syncRequests <- since
return <-syncResponses, 200, nil
})
accumulator.incomingProcess = make(chan struct{})
accumulator.unblockProcess = make(chan struct{})
pm := NewPollerMap(client, false)
pm.SetCallbacks(accumulator)
ensurePollingUnblocked := make(chan struct{})
var wg sync.WaitGroup
n := 3
wg.Add(n)
for i := 0; i < n; i++ {
go func() {
t.Logf("EnsurePolling")
pm.EnsurePolling(PollerID{UserID: "@alice:localhost", DeviceID: "FOOBAR"}, "access_token", "", false, zerolog.New(os.Stderr))
wg.Done()
t.Logf("EnsurePolling unblocked")
}()
}
go func() {
wg.Wait()
close(ensurePollingUnblocked)
t.Logf("EnsurePolling all unblocked")
}()
ensureBlocking := func() {
select {
case <-ensurePollingUnblocked:
t.Fatalf("EnsurePolling unblocked")
default:
t.Logf("EnsurePolling still blocking")
}
}
// wait until we get a /sync request
since := <-syncRequests
if since != "" {
t.Fatalf("/sync not made with empty since token, got %v", since)
}
t.Logf("Recv /sync request")
// make sure we're still blocking
ensureBlocking()
// respond to the /sync request
syncResponses <- initialResponse
t.Logf("Responded to /sync request")
// make sure we're still blocking
ensureBlocking()
// wait until we are processing the state response
<-accumulator.incomingProcess
t.Logf("Processing response...")
// make sure we're still blocking
ensureBlocking()
// finish processing
accumulator.unblockProcess <- struct{}{}
t.Logf("Processed response.")
// make sure we unblock
select {
case <-ensurePollingUnblocked:
case <-time.After(time.Second):
t.Fatalf("EnsurePolling did not unblock after 1s")
}
t.Logf("EnsurePolling unblocked")
}
func TestPollerMapEnsurePollingFailsWithExpiredToken(t *testing.T) {
accumulator, client := newMocks(func(authHeader, since string) (*SyncResponse, int, error) {
t.Logf("Responding to token '%s' with 401 Unauthorized", authHeader)
return nil, http.StatusUnauthorized, fmt.Errorf("RUH ROH unrecognised token")
})
pm := NewPollerMap(client, false)
pm.SetCallbacks(accumulator)
created, err := pm.EnsurePolling(PollerID{}, "dummy_token", "", true, zerolog.New(os.Stderr))
if created {
t.Errorf("Expected created=false, got created=true")
}
if err == nil {
t.Errorf("Expected nonnil error, got nil")
}
}
func TestPollerMap_ExpirePollers(t *testing.T) {
receiver, client := newMocks(func(authHeader, since string) (*SyncResponse, int, error) {
r := SyncResponse{
NextBatch: "batchy-mc-batchface",
}
return &r, 200, nil
})
pm := NewPollerMap(client, false)
pm.SetCallbacks(receiver)
// Start 5 pollers.
pollerSpecs := []struct {
UserID string
DeviceID string
Token string
}{
{UserID: "alice", DeviceID: "a_device", Token: "a_token"},
{UserID: "bob", DeviceID: "b_device2", Token: "b_token1"},
{UserID: "bob", DeviceID: "b_device1", Token: "b_token2"},
{UserID: "chris", DeviceID: "phone", Token: "c_token"},
{UserID: "delia", DeviceID: "phone", Token: "d_token"},
}
for i, spec := range pollerSpecs {
created, err := pm.EnsurePolling(
PollerID{UserID: spec.UserID, DeviceID: spec.DeviceID},
spec.Token, "", true, logger,
)
if err != nil {
t.Errorf("EnsurePolling error for poller #%d (%v): %s", i, spec, err)
}
if !created {
t.Errorf("Poller #%d (%v) was not newly created", i, spec)
}
}
// Expire some of them. This tests that:
pm.ExpirePollers([]PollerID{
// - Easy mode: if you have one poller and ask it to be deleted, it is deleted.
{"alice", "a_device"},
// - If you have two devices and ask for one of their pollers to be expired,
// only that poller is terminated.
{"bob", "b_device1"},
// - If there is a device ID clash, only the specified user's poller is expired.
// I.e. Delia unaffected
{"chris", "phone"},
})
// Try to recreate each poller. EnsurePolling should only report having to create a
// poller for the pollers we asked to be deleted.
expectDeleted := []bool{
true,
false,
true,
true,
false,
}
for i, spec := range pollerSpecs {
created, err := pm.EnsurePolling(
PollerID{UserID: spec.UserID, DeviceID: spec.DeviceID},
spec.Token, "", true, logger,
)
if err != nil {
t.Errorf("EnsurePolling error for poller #%d (%v): %s", i, spec, err)
}
if created != expectDeleted[i] {
t.Errorf("Poller #%d (%v): created=%t, expected %t", i, spec, created, expectDeleted[i])
}
}
}
// Check that a call to Poll starts polling and accumulating, and terminates on 401s.
func TestPollerPollFromNothing(t *testing.T) {
nextSince := "next"
pid := PollerID{UserID: "@alice:localhost", DeviceID: "FOOBAR"}
roomID := "!foo:bar"
roomState := []json.RawMessage{
json.RawMessage(`{"event":1}`),
json.RawMessage(`{"event":2}`),
json.RawMessage(`{"event":3}`),
}
hasPolledSuccessfully := make(chan struct{})
accumulator, client := newMocks(func(authHeader, since string) (*SyncResponse, int, error) {
if since == "" {
var joinResp SyncV2JoinResponse
joinResp.State.Events = roomState
return &SyncResponse{
NextBatch: nextSince,
Rooms: struct {
Join map[string]SyncV2JoinResponse `json:"join"`
Invite map[string]SyncV2InviteResponse `json:"invite"`
Leave map[string]SyncV2LeaveResponse `json:"leave"`
}{
Join: map[string]SyncV2JoinResponse{
roomID: joinResp,
},
},
}, 200, nil
}
return nil, 401, fmt.Errorf("terminated")
})
var wg sync.WaitGroup
wg.Add(1)
poller := newPoller(pid, "Authorization: hello world", client, accumulator, zerolog.New(os.Stderr), false)
go func() {
defer wg.Done()
poller.Poll("")
}()
go func() {
poller.WaitUntilInitialSync()
close(hasPolledSuccessfully)
}()
wg.Wait()
select {
case <-hasPolledSuccessfully:
break
case <-time.After(time.Second):
t.Errorf("WaitUntilInitialSync failed to fire")
}
if len(accumulator.states[roomID]) != len(roomState) {
t.Errorf("did not accumulate initial state for room, got %d events want %d", len(accumulator.states[roomID]), len(roomState))
}
if accumulator.pollerIDToSince[pid] != nextSince {
t.Errorf("did not persist latest since token, got %s want %s", accumulator.pollerIDToSince[pid], nextSince)
}
}
// Check that a call to Poll starts polling with an existing since token and accumulates timeline entries
func TestPollerPollFromExisting(t *testing.T) {
pid := PollerID{UserID: "@alice:localhost", DeviceID: "FOOBAR"}
roomID := "!foo:bar"
since := initialSinceToken
roomTimelineResponses := [][]json.RawMessage{
{
json.RawMessage(`{"event":1}`),
json.RawMessage(`{"event":2}`),
json.RawMessage(`{"event":3}`),
json.RawMessage(`{"event":4}`),
},
{
json.RawMessage(`{"event":5}`),
json.RawMessage(`{"event":6}`),
json.RawMessage(`{"event":7}`),
},
{
json.RawMessage(`{"event":8}`),
json.RawMessage(`{"event":9}`),
},
{
json.RawMessage(`{"event":10}`),
},
}
toDeviceResponses := [][]json.RawMessage{
{}, {}, {}, {json.RawMessage(`{}`)},
}
hasPolledSuccessfully := make(chan struct{})
accumulator, client := newMocks(func(authHeader, since string) (*SyncResponse, int, error) {
if since == "" {
t.Errorf("called DoSyncV2 with an empty since token")
return nil, 401, fmt.Errorf("fail")
}
sinceInt, err := strconv.Atoi(since)
if err != nil {
t.Errorf("called DoSyncV2 with invalid since: %s", since)
return nil, 401, fmt.Errorf("fail")
}
if sinceInt >= len(roomTimelineResponses) {
return nil, 401, fmt.Errorf("terminated")
}
var joinResp SyncV2JoinResponse
joinResp.Timeline.Events = roomTimelineResponses[sinceInt]
return &SyncResponse{
// Add dummy to-device messages so the poller actually persists the since token. (It only
// does so on the first poll, when at least 1min has passed since the last store (this test
// doesn't run that long), or when the response contains to-device messages.)
ToDevice: EventsResponse{Events: toDeviceResponses[sinceInt]},
NextBatch: fmt.Sprintf("%d", sinceInt+1),
Rooms: struct {
Join map[string]SyncV2JoinResponse `json:"join"`
Invite map[string]SyncV2InviteResponse `json:"invite"`
Leave map[string]SyncV2LeaveResponse `json:"leave"`
}{
Join: map[string]SyncV2JoinResponse{
roomID: joinResp,
},
},
}, 200, nil
})
var wg sync.WaitGroup
wg.Add(1)
poller := newPoller(pid, "Authorization: hello world", client, accumulator, zerolog.New(os.Stderr), false)
go func() {
defer wg.Done()
poller.Poll(since)
}()
go func() {
poller.WaitUntilInitialSync()
close(hasPolledSuccessfully)
}()
wg.Wait()
select {
case <-hasPolledSuccessfully:
break
case <-time.After(time.Second):
t.Errorf("WaitUntilInitialSync failed to fire")
}
if len(accumulator.timelines[roomID]) != 10 {
t.Errorf("did not accumulate timelines for room, got %d events want %d", len(accumulator.timelines[roomID]), 10)
}
wantSince := fmt.Sprintf("%d", len(roomTimelineResponses))
if accumulator.pollerIDToSince[pid] != wantSince {
t.Errorf("did not persist latest since token, got %s want %s", accumulator.pollerIDToSince[pid], wantSince)
}
}
// Check that the since token in the database
// 1. is updated if it is the first iteration of poll
// 2. is NOT updated for random events
// 3. is updated if the syncV2 response contains ToDevice messages
// 4. is updated if at least 1min has passed since we last stored a token
func TestPollerPollUpdateDeviceSincePeriodically(t *testing.T) {
pid := PollerID{UserID: "@alice:localhost", DeviceID: "FOOBAR"}
syncResponses := make(chan *SyncResponse, 1)
syncCalledWithSince := make(chan string)
accumulator, client := newMocks(func(authHeader, since string) (*SyncResponse, int, error) {
if since != "" {
syncCalledWithSince <- since
}
return <-syncResponses, 200, nil
})
accumulator.updateSinceCalled = make(chan struct{}, 1)
poller := newPoller(pid, "Authorization: hello world", client, accumulator, zerolog.New(os.Stderr), false)
defer poller.Terminate()
go func() {
poller.Poll(initialSinceToken)
}()
hasPolledSuccessfully := make(chan struct{})
go func() {
poller.WaitUntilInitialSync()
close(hasPolledSuccessfully)
}()
// 1. Initial poll updates the database
next := "1"
syncResponses <- &SyncResponse{NextBatch: next}
mustEqualSince(t, <-syncCalledWithSince, initialSinceToken)
select {
case <-hasPolledSuccessfully:
break
case <-time.After(time.Second):
t.Errorf("WaitUntilInitialSync failed to fire")
}
// Also check that UpdateDeviceSince was called
select {
case <-accumulator.updateSinceCalled:
case <-time.After(time.Millisecond * 100): // give the Poller some time to process the response
t.Fatalf("did not receive call to UpdateDeviceSince in time")
}
if got := accumulator.pollerIDToSince[pid]; got != next {
t.Fatalf("expected since to be updated to %s, but got %s", next, got)
}
// The since token used by calls to doSyncV2
wantSinceFromSync := next
// 2. Second request updates the state but NOT the database
syncResponses <- &SyncResponse{NextBatch: "2"}
mustEqualSince(t, <-syncCalledWithSince, wantSinceFromSync)
select {
case <-accumulator.updateSinceCalled:
t.Fatalf("unexpected call to UpdateDeviceSince")
case <-time.After(time.Millisecond * 100):
}
if got := accumulator.pollerIDToSince[pid]; got != next {
t.Fatalf("expected since to be updated to %s, but got %s", next, got)
}
// 3. Sync response contains a toDevice message and should be stored in the database
wantSinceFromSync = "2"
next = "3"
syncResponses <- &SyncResponse{
NextBatch: next,
ToDevice: EventsResponse{Events: []json.RawMessage{{}}},
}
mustEqualSince(t, <-syncCalledWithSince, wantSinceFromSync)
select {
case <-accumulator.updateSinceCalled:
case <-time.After(time.Millisecond * 100):
t.Fatalf("did not receive call to UpdateDeviceSince in time")
}
if got := accumulator.pollerIDToSince[pid]; got != next {
t.Fatalf("expected since to be updated to %s, but got %s", wantSinceFromSync, got)
}
wantSinceFromSync = next
// 4. ... some time has passed, this triggers the 1min limit
setTimeSinceValue(time.Minute * 2)
defer setTimeSinceValue(0) // reset
next = "10"
syncResponses <- &SyncResponse{NextBatch: next}
mustEqualSince(t, <-syncCalledWithSince, wantSinceFromSync)
select {
case <-accumulator.updateSinceCalled:
case <-time.After(time.Millisecond * 100):
t.Fatalf("did not receive call to UpdateDeviceSince in time")
}
if got := accumulator.pollerIDToSince[pid]; got != next {
t.Fatalf("expected since to be updated to %s, but got %s", wantSinceFromSync, got)
}
}
func mustEqualSince(t *testing.T, gotSince, expectedSince string) {
t.Helper()
if gotSince != expectedSince {
t.Fatalf("client.DoSyncV2 using unexpected since token: %s, want %s", gotSince, expectedSince)
}
}
func TestPollerGivesUpEventually(t *testing.T) {
deviceID := "FOOBAR"
hasPolledSuccessfully := make(chan struct{})
accumulator, client := newMocks(func(authHeader, since string) (*SyncResponse, int, error) {
return nil, 524, fmt.Errorf("gateway timeout")
})
// Sleep for a real (but tiny) duration so that any pending async actions can still run.
setTimeSleepDelay(time.Microsecond)
defer func() { // reset the value after the test runs
setTimeSleepDelay(0)
}()
var wg sync.WaitGroup
wg.Add(1)
poller := newPoller(PollerID{UserID: "@alice:localhost", DeviceID: deviceID}, "Authorization: hello world", client, accumulator, zerolog.New(os.Stderr), false)
go func() {
defer wg.Done()
poller.Poll("")
}()
go func() {
poller.WaitUntilInitialSync()
close(hasPolledSuccessfully)
}()
wg.Wait()
select {
case <-hasPolledSuccessfully:
case <-time.After(100 * time.Millisecond):
break
}
// poller should be in the terminated state
if !poller.terminated.Load() {
t.Errorf("poller was not terminated")
}
}
// Tests that the poller sleeps for the expected backoff duration in response to a variety of errors
func TestPollerBackoff(t *testing.T) {
deviceID := "FOOBAR"
hasPolledSuccessfully := make(chan struct{})
errorResponses := []struct {
code int
backoff time.Duration
err error
}{
{
code: 0,
err: fmt.Errorf("network error"),
backoff: 3 * time.Second,
},
{
code: 500,
err: fmt.Errorf("internal server error"),
backoff: 3 * time.Second,
},
{
code: 502,
err: fmt.Errorf("bad gateway error"),
backoff: 3 * time.Second,
},
{
code: 404,
err: fmt.Errorf("not found"),
backoff: 3 * time.Second,
},
}
errorResponsesIndex := 0
var wantBackoffDuration time.Duration
accumulator, client := newMocks(func(authHeader, since string) (*SyncResponse, int, error) {
if errorResponsesIndex >= len(errorResponses) {
return nil, 401, fmt.Errorf("terminated")
}
i := errorResponsesIndex
errorResponsesIndex += 1
wantBackoffDuration = errorResponses[i].backoff
return nil, errorResponses[i].code, errorResponses[i].err
})
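// Sleep for 1ms instead of the real backoff, but check that the poller asked to sleep
// for the expected backoff duration.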
setTimeSleepDelay(time.Millisecond, func(d time.Duration) {
if d != wantBackoffDuration {
t.Errorf("time.Sleep called incorrectly: got %v want %v", d, wantBackoffDuration)
}
})
defer func() { // reset the value after the test runs
setTimeSleepDelay(0)
}()
var wg sync.WaitGroup
wg.Add(1)
poller := newPoller(PollerID{UserID: "@alice:localhost", DeviceID: deviceID}, "Authorization: hello world", client, accumulator, zerolog.New(os.Stderr), false)
go func() {
defer wg.Done()
poller.Poll("some_since_value")
}()
go func() {
poller.WaitUntilInitialSync()
close(hasPolledSuccessfully)
}()
wg.Wait()
select {
case <-hasPolledSuccessfully:
case <-time.After(100 * time.Millisecond):
break
}
if errorResponsesIndex != len(errorResponses) {
t.Errorf("did not call DoSyncV2 enough, got %d times, want %d", errorResponsesIndex+1, len(errorResponses))
}
}
// Regression test to make sure that if you start polling with an invalid token, we do end up unblocking WaitUntilInitialSync
// and don't end up blocking forever.
func TestPollerUnblocksIfTerminatedInitially(t *testing.T) {
deviceID := "FOOBAR"
accumulator, client := newMocks(func(authHeader, since string) (*SyncResponse, int, error) {
return nil, 401, fmt.Errorf("terminated")
})
pollUnblocked := make(chan struct{})
waitUntilInitialSyncUnblocked := make(chan struct{})
poller := newPoller(PollerID{UserID: "@alice:localhost", DeviceID: deviceID}, "Authorization: hello world", client, accumulator, zerolog.New(os.Stderr), false)
go func() {
poller.Poll("")
close(pollUnblocked)
}()
go func() {
poller.WaitUntilInitialSync()
close(waitUntilInitialSyncUnblocked)
}()
select {
case <-pollUnblocked:
break
case <-time.After(time.Second):
t.Errorf("Poll() did not unblock")
}
select {
case <-waitUntilInitialSyncUnblocked:
break
case <-time.After(time.Second):
t.Errorf("WaitUntilInitialSync() did not unblock")
}
}
// Test that the poller sends the same sync v2 request, without incrementing the since token,
// when an errorable callback returns an error.
func TestPollerResendsOnCallbackError(t *testing.T) {
pid := PollerID{UserID: "@TestPollerResendsOnCallbackError:localhost", DeviceID: "FOOBAR"}
defer func() { // reset the value after the test runs
setTimeSleepDelay(0)
}()
// we don't actually want to wait 3s between retries, so monkey patch it out
setTimeSleepDelay(time.Millisecond)
testCases := []struct {
name string
generateReceiver func() V2DataReceiver
syncResponse *SyncResponse
}{
{
name: "AddToDeviceMessages",
// generate a response which will trigger the right callback
syncResponse: &SyncResponse{
ToDevice: EventsResponse{
Events: []json.RawMessage{
[]byte(`{"type":"device","content":{"yep":true}}`),
},
},
},
// generate a receiver which errors for the right callback
generateReceiver: func() V2DataReceiver {
return &overrideDataReceiver{
addToDeviceMessages: func(ctx context.Context, userID, deviceID string, msgs []json.RawMessage) error {
return fmt.Errorf("addToDeviceMessages error")
},
}
},
},
{
name: "OnE2EEData,OTKCount",
// generate a response which will trigger the right callback
syncResponse: &SyncResponse{
DeviceListsOTKCount: map[string]int{
"foo": 5,
},
},
// generate a receiver which errors for the right callback
generateReceiver: func() V2DataReceiver {
return &overrideDataReceiver{
onE2EEData: func(ctx context.Context, userID, deviceID string, otkCounts map[string]int, fallbackKeyTypes []string, deviceListChanges map[string]int) error {
return fmt.Errorf("onE2EEData error")
},
}
},
},
{
name: "OnE2EEData,FallbackKeys",
// generate a response which will trigger the right callback
syncResponse: &SyncResponse{
DeviceUnusedFallbackKeyTypes: []string{"foo", "bar"},
},
// generate a receiver which errors for the right callback
generateReceiver: func() V2DataReceiver {
return &overrideDataReceiver{
onE2EEData: func(ctx context.Context, userID, deviceID string, otkCounts map[string]int, fallbackKeyTypes []string, deviceListChanges map[string]int) error {
return fmt.Errorf("onE2EEData error")
},
}
},
},
{
name: "OnE2EEData,DeviceListChanges",
// generate a response which will trigger the right callback
syncResponse: &SyncResponse{
DeviceLists: struct {
Changed []string `json:"changed,omitempty"`
Left []string `json:"left,omitempty"`
}{
Changed: []string{"alice"},
Left: []string{"bob"},
},
},
// generate a receiver which errors for the right callback
generateReceiver: func() V2DataReceiver {
return &overrideDataReceiver{
onE2EEData: func(ctx context.Context, userID, deviceID string, otkCounts map[string]int, fallbackKeyTypes []string, deviceListChanges map[string]int) error {
return fmt.Errorf("onE2EEData error")
},
}
},
},
{
name: "Initialise",
// generate a response which will trigger the right callback
syncResponse: &SyncResponse{
Rooms: SyncRoomsResponse{
Join: map[string]SyncV2JoinResponse{
"!foo:bar": {
State: EventsResponse{
Events: []json.RawMessage{
[]byte(`{"type":"m.room.create","state_key":"","content":{},"sender":"@alice:localhost","event_id":"$111"}`),
},
},
},
},
},
},
// generate a receiver which errors for the right callback
generateReceiver: func() V2DataReceiver {
return &overrideDataReceiver{
initialise: func(ctx context.Context, roomID string, state []json.RawMessage) error {
return fmt.Errorf("initialise error")
},
}
},
},
{
name: "Accumulate",
// generate a response which will trigger the right callback
syncResponse: &SyncResponse{
Rooms: SyncRoomsResponse{
Join: map[string]SyncV2JoinResponse{
"!foo:bar": {
Timeline: TimelineResponse{
Events: []json.RawMessage{
[]byte(`{"type":"m.room.message","content":{},"sender":"@alice:localhost","event_id":"$222"}`),
},
},
},
},
},
},
// generate a receiver which errors for the right callback
generateReceiver: func() V2DataReceiver {
return &overrideDataReceiver{
accumulate: func(ctx context.Context, userID, deviceID, roomID, prevBatch string, timeline []json.RawMessage) error {
return fmt.Errorf("accumulate error")
},
}
},
},
{
name: "OnAccountData,Global",
// generate a response which will trigger the right callback
syncResponse: &SyncResponse{
AccountData: EventsResponse{
Events: []json.RawMessage{
[]byte(`{"type":"foo","content":{"bar":53}}`),
},
},
},
// generate a receiver which errors for the right callback
generateReceiver: func() V2DataReceiver {
return &overrideDataReceiver{
onAccountData: func(ctx context.Context, userID, roomID string, events []json.RawMessage) error {
return fmt.Errorf("onAccountData error")
},
}
},
},
{
name: "OnAccountData,Room",
// generate a response which will trigger the right callback
syncResponse: &SyncResponse{
Rooms: SyncRoomsResponse{
Join: map[string]SyncV2JoinResponse{
"!foo:bar": {
AccountData: EventsResponse{
Events: []json.RawMessage{
[]byte(`{"type":"foo_room","content":{"bar":53}}`),
},
},
},
},
},
},
// generate a receiver which errors for the right callback
generateReceiver: func() V2DataReceiver {
return &overrideDataReceiver{
onAccountData: func(ctx context.Context, userID, roomID string, events []json.RawMessage) error {
return fmt.Errorf("onAccountData error")
},
}
},
},
{
name: "OnInvite",
// generate a response which will trigger the right callback
syncResponse: &SyncResponse{
Rooms: SyncRoomsResponse{
Invite: map[string]SyncV2InviteResponse{
"!foo:bar": {
InviteState: EventsResponse{
Events: []json.RawMessage{
[]byte(`{"type":"foo_room","content":{"bar":53}}`),
},
},
},
},
},
},
// generate a receiver which errors for the right callback
generateReceiver: func() V2DataReceiver {
return &overrideDataReceiver{
onInvite: func(ctx context.Context, userID, roomID string, inviteState []json.RawMessage) error {
return fmt.Errorf("onInvite error")
},
}
},
},
{
name: "OnLeftRoom",
// generate a response which will trigger the right callback
syncResponse: &SyncResponse{
Rooms: SyncRoomsResponse{
Leave: map[string]SyncV2LeaveResponse{
"!foo:bar": {
Timeline: TimelineResponse{
Events: []json.RawMessage{
[]byte(`{"type":"m.room.member","state_key":"` + pid.UserID + `","content":{"membership":"leave"}}`),
},
},
},
},
},
},
// generate a receiver which errors for the right callback
generateReceiver: func() V2DataReceiver {
return &overrideDataReceiver{
onLeftRoom: func(ctx context.Context, userID, roomID string, leaveEvent json.RawMessage) error {
return fmt.Errorf("onLeftRoom error")
},
}
},
},
}
for _, tc := range testCases {
tc := tc
// test that if you get an error NOT on an initial sync, we retry with the right since token
var numSyncLoops int
waitForStuckPolling := make(chan struct{})
client := &mockClient{
// We initially sync with `initialSinceToken`, then advance the client to "1".
// We then expect the client to get stuck with "1" and never send "2", because
// the error returned in the receiver is preventing the advancement.
fn: func(authHeader, since string) (*SyncResponse, int, error) {
t.Logf("%v DoSyncV2 since=%v", tc.name, since)
if since == initialSinceToken {
return &SyncResponse{
NextBatch: "1",
}, 200, nil
}
if since != "1" {
t.Errorf("bad since token, got %s", since)
close(waitForStuckPolling)
return nil, 0, fmt.Errorf("bad since token: %v", since)
}
numSyncLoops++
if numSyncLoops > 2 {
// we are clearly stuck hitting "1" over and over, which is good.
// close the channel to unblock this test.
t.Log("stuck in a loop: closing channel")
close(waitForStuckPolling)
}
// we should never sync with since="2" because this syncResponse is going to
// produce an error in the receiver
tc.syncResponse.NextBatch = "2"
// we expect the since token to be "1"
return tc.syncResponse, 200, nil
},
}
receiver := tc.generateReceiver()
poller := newPoller(pid, "Authorization: hello world", client, receiver, zerolog.New(os.Stderr), false)
waitForInitialSync(t, poller)
select {
case <-waitForStuckPolling:
poller.Terminate()
continue
case <-time.After(time.Second):
t.Fatalf("%s: timed out waiting for repeated polling", tc.name)
}
}
}
// The purpose of this test is to make sure *internal.DataError errors do NOT cause the since token
// to be retried.
func TestPollerDoesNotResendOnDataError(t *testing.T) {
pid := PollerID{UserID: "@TestPollerDoesNotResendOnDataError:localhost", DeviceID: "FOOBAR"}
// make a receiver which will return a DataError when Accumulate is called
receiver := &overrideDataReceiver{
accumulate: func(ctx context.Context, userID, deviceID, roomID, prevBatch string, timeline []json.RawMessage) error {
return internal.NewDataError("this is a test: %v", 42)
},
}
waitForSuccess := make(chan struct{})
lastSince := ""
client := &mockClient{
// Process the initial sync then send back a timeline for a room which will cause Accumulate to be called.
// This should return a DataError but the since token will still be advanced due to it being a DataError
// and not some other kind of error.
fn: func(authHeader, since string) (*SyncResponse, int, error) {
if since == "" {
// we should start with since=0, but can end up with since=""
// if we continue to advance beyond what we are testing.
return nil, 500, fmt.Errorf("test has ended")
}
t.Logf("DoSyncV2 since=%v, last=%v", since, lastSince)
if lastSince == since {
t.Errorf("since token was retried: got %v", since)
}
lastSince = since
switch since {
case initialSinceToken:
// skip over initial syncs
return &SyncResponse{
NextBatch: "1",
}, 200, nil
case "1":
// return a response which will trigger Accumulate code
return &SyncResponse{
NextBatch: "2",
Rooms: SyncRoomsResponse{
Join: map[string]SyncV2JoinResponse{
"!foo:bar": {
Timeline: TimelineResponse{
Events: []json.RawMessage{
[]byte(`{"type":"m.room.message","content":{},"sender":"@alice:localhost","event_id":"$222"}`),
},
},
},
},
},
}, 200, nil
case "2":
close(waitForSuccess)
return &SyncResponse{
NextBatch: "3",
}, 200, nil
}
return &SyncResponse{
NextBatch: "",
}, 200, nil
},
}
poller := newPoller(pid, "Authorization: hello world", client, receiver, zerolog.New(os.Stderr), false)
waitForInitialSync(t, poller)
select {
case <-waitForSuccess:
case <-time.After(time.Second):
t.Errorf("timed out waiting for repeated polling")
}
poller.Terminate()
}
// The purpose of this test is to make sure we don't incorrectly skip retrying when a v2 response has many errors,
// some of which are retriable and some of which are not.
func TestPollerResendsOnDataErrorWithOtherErrors(t *testing.T) {
pid := PollerID{UserID: "@TestPollerResendsOnDataErrorWithOtherErrors:localhost", DeviceID: "FOOBAR"}
dontRetryRoomID := "!dont-retry:localhost"
// make a receiver which will return a DataError when Accumulate is called
receiver := &overrideDataReceiver{
accumulate: func(ctx context.Context, userID, deviceID, roomID, prevBatch string, timeline []json.RawMessage) error {
if roomID == dontRetryRoomID {
return internal.NewDataError("accumulate this is a test: %v", 42)
}
return fmt.Errorf("accumulate retriable error")
},
onAccountData: func(ctx context.Context, userID, roomID string, events []json.RawMessage) error {
return fmt.Errorf("onAccountData retriable error")
},
onLeftRoom: func(ctx context.Context, userID, roomID string, leaveEvent json.RawMessage) error {
return internal.NewDataError("onLeftRoom this is a test: %v", 42)
},
}
poller := newPoller(pid, "Authorization: hello world", nil, receiver, zerolog.New(os.Stderr), false)
testCases := []struct {
name string
res SyncResponse
wantRetry bool
}{
{
name: "single unretriable error",
res: SyncResponse{
NextBatch: "2",
Rooms: SyncRoomsResponse{
Join: map[string]SyncV2JoinResponse{
dontRetryRoomID: {
Timeline: TimelineResponse{
Events: []json.RawMessage{
testutils.NewMessageEvent(t, pid.UserID, "Don't Retry Me!"),
},
},
},
},
},
},
wantRetry: false,
},
{
name: "single retriable error",
res: SyncResponse{
NextBatch: "2",
Rooms: SyncRoomsResponse{
Join: map[string]SyncV2JoinResponse{
"!retry:localhost": {
AccountData: EventsResponse{
Events: []json.RawMessage{
testutils.NewAccountData(t, "m.retry", map[string]interface{}{}),
},
},
},
},
},
},
wantRetry: true,
},
{
name: "1 retriable error, 1 unretriable error",
res: SyncResponse{
NextBatch: "2",
Rooms: SyncRoomsResponse{
Join: map[string]SyncV2JoinResponse{
"!retry:localhost": {
AccountData: EventsResponse{
Events: []json.RawMessage{
testutils.NewAccountData(t, "m.retry", map[string]interface{}{}),
},
},
},
dontRetryRoomID: {
Timeline: TimelineResponse{
Events: []json.RawMessage{
testutils.NewMessageEvent(t, pid.UserID, "Don't Retry Me!"),
},
},
},
},
},
},
wantRetry: true,
},
{
name: "2 unretriable errors",
res: SyncResponse{
NextBatch: "2",
Rooms: SyncRoomsResponse{
Join: map[string]SyncV2JoinResponse{
dontRetryRoomID: {
Timeline: TimelineResponse{
Events: []json.RawMessage{
testutils.NewMessageEvent(t, pid.UserID, "Don't Retry Me!"),
},
},
},
},
Leave: map[string]SyncV2LeaveResponse{
dontRetryRoomID: {
Timeline: TimelineResponse{
Events: []json.RawMessage{
testutils.NewMessageEvent(t, pid.UserID, "Don't Retry Me!"),
},
},
},
},
},
},
wantRetry: false,
},
{
// sanity to make sure the 2x unretriable are both independently unretriable
name: "another 1 unretriable error",
res: SyncResponse{
NextBatch: "2",
Rooms: SyncRoomsResponse{
Leave: map[string]SyncV2LeaveResponse{
dontRetryRoomID: {
Timeline: TimelineResponse{
Events: []json.RawMessage{
testutils.NewMessageEvent(t, pid.UserID, "Don't Retry Me!"),
},
},
},
},
},
},
wantRetry: false,
},
}
// rather than set up the entire loop and machinery, just directly call parseRoomsResponse with various failure modes
for _, tc := range testCases {
err := poller.parseRoomsResponse(context.Background(), &tc.res)
if err == nil {
t.Errorf("%s: got no error", tc.name)
continue
}
t.Logf("%s: %s", tc.name, err)
got := shouldRetry(err)
if got != tc.wantRetry {
t.Errorf("%s: got retry %v want %v", tc.name, got, tc.wantRetry)
}
}
}
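// waitForInitialSync starts poller.Poll with initialSinceToken in the background and fails
// the test if WaitUntilInitialSync has not returned within 1s.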
func waitForInitialSync(t *testing.T, poller *poller) {
go func() {
poller.Poll(initialSinceToken)
}()
hasPolledSuccessfully := make(chan struct{})
go func() {
poller.WaitUntilInitialSync()
close(hasPolledSuccessfully)
}()
select {
case <-hasPolledSuccessfully:
return
case <-time.After(time.Second):
t.Fatalf("timed out waiting for WaitUntilInitialSync to return")
}
}
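// mockClient is a test double for the sync v2 HTTP client: DoSyncV2 delegates to the
// test-supplied fn, while Versions and WhoAmI return fixed values.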
type mockClient struct {
fn func(authHeader, since string) (*SyncResponse, int, error)
}
func (c *mockClient) Versions(ctx context.Context) ([]string, error) {
return []string{"v1.1"}, nil
}
func (c *mockClient) DoSyncV2(ctx context.Context, authHeader, since string, isFirst, toDeviceOnly bool) (*SyncResponse, int, error) {
return c.fn(authHeader, since)
}
func (c *mockClient) WhoAmI(ctx context.Context, authHeader string) (string, string, error) {
return "@alice:localhost", "device_123", nil
}
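// mockDataReceiver embeds overrideDataReceiver so every V2DataReceiver callback is a no-op
// by default, and records the state, timeline events and since tokens it is handed. The
// optional channels let tests pause processing (incomingProcess/unblockProcess) or observe
// UpdateDeviceSince calls (updateSinceCalled).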
type mockDataReceiver struct {
*overrideDataReceiver
mu *sync.Mutex
states map[string][]json.RawMessage
timelines map[string][]json.RawMessage
pollerIDToSince map[PollerID]string
incomingProcess chan struct{}
unblockProcess chan struct{}
updateSinceCalled chan struct{}
}
func (a *mockDataReceiver) Accumulate(ctx context.Context, userID, deviceID, roomID string, timeline TimelineResponse) error {
a.timelines[roomID] = append(a.timelines[roomID], timeline.Events...)
return nil
}
func (a *mockDataReceiver) Initialise(ctx context.Context, roomID string, state []json.RawMessage) error {
a.states[roomID] = state
if a.incomingProcess != nil {
a.incomingProcess <- struct{}{}
}
if a.unblockProcess != nil {
<-a.unblockProcess
}
// The poller only cares whether Initialise returned an error; nothing went wrong here,
// so return nil.
return nil
}
func (s *mockDataReceiver) UpdateDeviceSince(ctx context.Context, userID, deviceID, since string) {
s.mu.Lock()
defer s.mu.Unlock()
s.pollerIDToSince[PollerID{UserID: userID, DeviceID: deviceID}] = since
if s.updateSinceCalled != nil {
s.updateSinceCalled <- struct{}{}
}
}
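// overrideDataReceiver is a V2DataReceiver whose callbacks are individually overridable:
// each method is a no-op (returning nil where applicable) unless the corresponding
// function field is set.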
type overrideDataReceiver struct {
accumulate func(ctx context.Context, userID, deviceID, roomID, prevBatch string, timeline []json.RawMessage) error
initialise func(ctx context.Context, roomID string, state []json.RawMessage) error
setTyping func(ctx context.Context, pollerID PollerID, roomID string, ephEvent json.RawMessage)
updateDeviceSince func(ctx context.Context, userID, deviceID, since string)
addToDeviceMessages func(ctx context.Context, userID, deviceID string, msgs []json.RawMessage) error
updateUnreadCounts func(ctx context.Context, roomID, userID string, highlightCount, notifCount *int)
onAccountData func(ctx context.Context, userID, roomID string, events []json.RawMessage) error
onReceipt func(ctx context.Context, userID, roomID, ephEventType string, ephEvent json.RawMessage)
onInvite func(ctx context.Context, userID, roomID string, inviteState []json.RawMessage) error
onLeftRoom func(ctx context.Context, userID, roomID string, leaveEvent json.RawMessage) error
onE2EEData func(ctx context.Context, userID, deviceID string, otkCounts map[string]int, fallbackKeyTypes []string, deviceListChanges map[string]int) error
onTerminated func(ctx context.Context, pollerID PollerID)
onExpiredToken func(ctx context.Context, accessTokenHash, userID, deviceID string)
}
func (s *overrideDataReceiver) Accumulate(ctx context.Context, userID, deviceID, roomID string, timeline TimelineResponse) error {
if s.accumulate == nil {
return nil
}
return s.accumulate(ctx, userID, deviceID, roomID, timeline.PrevBatch, timeline.Events)
}
func (s *overrideDataReceiver) Initialise(ctx context.Context, roomID string, state []json.RawMessage) error {
if s.initialise == nil {
return nil
}
return s.initialise(ctx, roomID, state)
}
func (s *overrideDataReceiver) SetTyping(ctx context.Context, pollerID PollerID, roomID string, ephEvent json.RawMessage) {
if s.setTyping == nil {
return
}
s.setTyping(ctx, pollerID, roomID, ephEvent)
}
func (s *overrideDataReceiver) UpdateDeviceSince(ctx context.Context, userID, deviceID, since string) {
if s.updateDeviceSince == nil {
return
}
s.updateDeviceSince(ctx, userID, deviceID, since)
}
func (s *overrideDataReceiver) AddToDeviceMessages(ctx context.Context, userID, deviceID string, msgs []json.RawMessage) error {
if s.addToDeviceMessages == nil {
return nil
}
return s.addToDeviceMessages(ctx, userID, deviceID, msgs)
}
func (s *overrideDataReceiver) UpdateUnreadCounts(ctx context.Context, roomID, userID string, highlightCount, notifCount *int) {
if s.updateUnreadCounts == nil {
return
}
s.updateUnreadCounts(ctx, roomID, userID, highlightCount, notifCount)
}
func (s *overrideDataReceiver) OnAccountData(ctx context.Context, userID, roomID string, events []json.RawMessage) error {
if s.onAccountData == nil {
return nil
}
return s.onAccountData(ctx, userID, roomID, events)
}
func (s *overrideDataReceiver) OnReceipt(ctx context.Context, userID, roomID, ephEventType string, ephEvent json.RawMessage) {
if s.onReceipt == nil {
return
}
s.onReceipt(ctx, userID, roomID, ephEventType, ephEvent)
}
func (s *overrideDataReceiver) OnInvite(ctx context.Context, userID, roomID string, inviteState []json.RawMessage) error {
if s.onInvite == nil {
return nil
}
return s.onInvite(ctx, userID, roomID, inviteState)
}
func (s *overrideDataReceiver) OnLeftRoom(ctx context.Context, userID, roomID string, leaveEvent json.RawMessage) error {
if s.onLeftRoom == nil {
return nil
}
return s.onLeftRoom(ctx, userID, roomID, leaveEvent)
}
func (s *overrideDataReceiver) OnE2EEData(ctx context.Context, userID, deviceID string, otkCounts map[string]int, fallbackKeyTypes []string, deviceListChanges map[string]int) error {
if s.onE2EEData == nil {
return nil
}
return s.onE2EEData(ctx, userID, deviceID, otkCounts, fallbackKeyTypes, deviceListChanges)
}
func (s *overrideDataReceiver) OnTerminated(ctx context.Context, pollerID PollerID) {
if s.onTerminated == nil {
return
}
s.onTerminated(ctx, pollerID)
}
func (s *overrideDataReceiver) OnExpiredToken(ctx context.Context, accessTokenHash, userID, deviceID string) {
if s.onExpiredToken == nil {
return
}
s.onExpiredToken(ctx, accessTokenHash, userID, deviceID)
}
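// newMocks returns a mockDataReceiver and a mockClient whose DoSyncV2 requests are served
// by doSyncV2.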
func newMocks(doSyncV2 func(authHeader, since string) (*SyncResponse, int, error)) (*mockDataReceiver, *mockClient) {
client := &mockClient{
fn: doSyncV2,
}
accumulator := &mockDataReceiver{
overrideDataReceiver: &overrideDataReceiver{},
mu: &sync.Mutex{},
states: make(map[string][]json.RawMessage),
timelines: make(map[string][]json.RawMessage),
pollerIDToSince: make(map[PollerID]string),
}
return accumulator, client
}