sliding-sync/internal/context.go

186 lines
4.3 KiB
Go
Raw Permalink Normal View History

2022-08-16 14:23:05 +01:00
package internal
import (
"context"
2023-07-24 14:42:02 +01:00
"fmt"
"time"
2023-07-24 14:42:02 +01:00
2023-04-18 22:01:40 +01:00
"github.com/getsentry/sentry-go"
2022-08-16 14:23:05 +01:00
"github.com/rs/zerolog"
)
type ctx string
var (
ctxData ctx = "syncv3_data"
)
// logging metadata for a single request
type data struct {
2022-09-08 09:42:16 +01:00
userID string
2023-06-23 17:35:39 +01:00
deviceID string
2023-07-24 14:42:02 +01:00
bufferSummary string
2023-07-31 10:24:36 +01:00
connID string
2022-09-08 09:42:16 +01:00
since int64
next int64
numRooms int
txnID string
setupTime time.Duration
processingTime time.Duration
2022-09-08 09:42:16 +01:00
numToDeviceEvents int
numGlobalAccountData int
add extensions for typing and receipts; bugfixes and additional perf improvements Features: - Add `typing` extension. - Add `receipts` extension. - Add comprehensive prometheus `/metrics` activated via `SYNCV3_PROM`. - Add `SYNCV3_PPROF` support. - Add `by_notification_level` sort order. - Add `include_old_rooms` support. - Add support for `$ME` and `$LAZY`. - Add correct filtering when `*,*` is used as `required_state`. - Add `num_live` to each room response to indicate how many timeline entries are live. Bug fixes: - Use a stricter comparison function on ranges: fixes an issue whereby UTs fail on go1.19 due to change in sorting algorithm. - Send back an `errcode` on HTTP errors (e.g expired sessions). - Remove `unsigned.txn_id` on insertion into the DB. Otherwise other users would see other users txn IDs :( - Improve range delta algorithm: previously it didn't handle cases like `[0,20] -> [20,30]` and would panic. - Send HTTP 400 for invalid range requests. - Don't publish no-op unread counts which just adds extra noise. - Fix leaking DB connections which could eventually consume all available connections. - Ensure we always unblock WaitUntilInitialSync even on invalid access tokens. Other code relies on WaitUntilInitialSync() actually returning at _some_ point e.g on startup we have N workers which bound the number of concurrent pollers made at any one time, we need to not just hog a worker forever. Improvements: - Greatly improve startup times of sync3 handlers by improving `JoinedRoomsTracker`: a modest amount of data would take ~28s to create the handler, now it takes 4s. - Massively improve initial initial v3 sync times, by refactoring `JoinedRoomsTracker`, from ~47s to <1s. - Add `SlidingSyncUntil...` in tests to reduce races. - Tweak the API shape of JoinedUsersForRoom to reduce state block processing time for large rooms from 63s to 39s. - Add trace task for initial syncs. - Include the proxy version in UA strings. - HTTP errors now wait 1s before returning to stop clients tight-looping on error. - Pending event buffer is now 2000. - Index the room ID first to cull the most events when returning timeline entries. Speeds up `SelectLatestEventsBetween` by a factor of 8. - Remove cancelled `m.room_key_requests` from the to-device inbox. Cuts down the amount of events in the inbox by ~94% for very large (20k+) inboxes, ~50% for moderate sized (200 events) inboxes. Adds book-keeping to remember the unacked to-device position for each client.
2022-12-14 18:53:55 +00:00
numChangedDevices int
numLeftDevices int
2023-07-31 10:24:36 +01:00
numLists int
roomSubs int
roomUnsubs int
2022-08-16 14:23:05 +01:00
}
// prepare a request context so it can contain syncv3 info
func RequestContext(ctx context.Context) context.Context {
d := &data{
since: -1,
next: -1,
numRooms: -1,
}
return context.WithValue(ctx, ctxData, d)
}
// add the user ID to this request context. Need to have called RequestContext first.
func AssociateUserIDWithRequest(ctx context.Context, userID, deviceID string) context.Context {
2022-08-16 14:23:05 +01:00
d := ctx.Value(ctxData)
if d == nil {
return ctx
2022-08-16 14:23:05 +01:00
}
da := d.(*data)
da.userID = userID
2023-06-23 17:35:39 +01:00
da.deviceID = deviceID
hub := sentry.GetHubFromContext(ctx)
if hub == nil {
// Basing the sentry-wrangling on the sentry-go net/http integration, see e.g.
// https://github.com/getsentry/sentry-go/blob/02e712a638c40cd9701ad52d5d1309d65d556ef9/http/sentryhttp.go#L84
hub = sentry.CurrentHub().Clone()
2023-04-18 22:01:40 +01:00
}
hub.ConfigureScope(func(scope *sentry.Scope) {
scope.SetUser(sentry.User{Username: userID, ID: deviceID})
})
return sentry.SetHubOnContext(ctx, hub)
2022-08-16 14:23:05 +01:00
}
2023-07-24 14:42:02 +01:00
func SetConnBufferInfo(ctx context.Context, bufferLen, nextLen, bufferCap int) {
d := ctx.Value(ctxData)
if d == nil {
return
}
da := d.(*data)
da.bufferSummary = fmt.Sprintf("%d/%d/%d", bufferLen, nextLen, bufferCap)
}
add extensions for typing and receipts; bugfixes and additional perf improvements Features: - Add `typing` extension. - Add `receipts` extension. - Add comprehensive prometheus `/metrics` activated via `SYNCV3_PROM`. - Add `SYNCV3_PPROF` support. - Add `by_notification_level` sort order. - Add `include_old_rooms` support. - Add support for `$ME` and `$LAZY`. - Add correct filtering when `*,*` is used as `required_state`. - Add `num_live` to each room response to indicate how many timeline entries are live. Bug fixes: - Use a stricter comparison function on ranges: fixes an issue whereby UTs fail on go1.19 due to change in sorting algorithm. - Send back an `errcode` on HTTP errors (e.g expired sessions). - Remove `unsigned.txn_id` on insertion into the DB. Otherwise other users would see other users txn IDs :( - Improve range delta algorithm: previously it didn't handle cases like `[0,20] -> [20,30]` and would panic. - Send HTTP 400 for invalid range requests. - Don't publish no-op unread counts which just adds extra noise. - Fix leaking DB connections which could eventually consume all available connections. - Ensure we always unblock WaitUntilInitialSync even on invalid access tokens. Other code relies on WaitUntilInitialSync() actually returning at _some_ point e.g on startup we have N workers which bound the number of concurrent pollers made at any one time, we need to not just hog a worker forever. Improvements: - Greatly improve startup times of sync3 handlers by improving `JoinedRoomsTracker`: a modest amount of data would take ~28s to create the handler, now it takes 4s. - Massively improve initial initial v3 sync times, by refactoring `JoinedRoomsTracker`, from ~47s to <1s. - Add `SlidingSyncUntil...` in tests to reduce races. - Tweak the API shape of JoinedUsersForRoom to reduce state block processing time for large rooms from 63s to 39s. - Add trace task for initial syncs. - Include the proxy version in UA strings. - HTTP errors now wait 1s before returning to stop clients tight-looping on error. - Pending event buffer is now 2000. - Index the room ID first to cull the most events when returning timeline entries. Speeds up `SelectLatestEventsBetween` by a factor of 8. - Remove cancelled `m.room_key_requests` from the to-device inbox. Cuts down the amount of events in the inbox by ~94% for very large (20k+) inboxes, ~50% for moderate sized (200 events) inboxes. Adds book-keeping to remember the unacked to-device position for each client.
2022-12-14 18:53:55 +00:00
func SetRequestContextResponseInfo(
ctx context.Context, since, next int64, numRooms int, txnID string, numToDeviceEvents, numGlobalAccountData int,
2023-07-31 10:24:36 +01:00
numChangedDevices, numLeftDevices int, connID string, numLists int, roomSubs, roomUnsubs int,
add extensions for typing and receipts; bugfixes and additional perf improvements Features: - Add `typing` extension. - Add `receipts` extension. - Add comprehensive prometheus `/metrics` activated via `SYNCV3_PROM`. - Add `SYNCV3_PPROF` support. - Add `by_notification_level` sort order. - Add `include_old_rooms` support. - Add support for `$ME` and `$LAZY`. - Add correct filtering when `*,*` is used as `required_state`. - Add `num_live` to each room response to indicate how many timeline entries are live. Bug fixes: - Use a stricter comparison function on ranges: fixes an issue whereby UTs fail on go1.19 due to change in sorting algorithm. - Send back an `errcode` on HTTP errors (e.g expired sessions). - Remove `unsigned.txn_id` on insertion into the DB. Otherwise other users would see other users txn IDs :( - Improve range delta algorithm: previously it didn't handle cases like `[0,20] -> [20,30]` and would panic. - Send HTTP 400 for invalid range requests. - Don't publish no-op unread counts which just adds extra noise. - Fix leaking DB connections which could eventually consume all available connections. - Ensure we always unblock WaitUntilInitialSync even on invalid access tokens. Other code relies on WaitUntilInitialSync() actually returning at _some_ point e.g on startup we have N workers which bound the number of concurrent pollers made at any one time, we need to not just hog a worker forever. Improvements: - Greatly improve startup times of sync3 handlers by improving `JoinedRoomsTracker`: a modest amount of data would take ~28s to create the handler, now it takes 4s. - Massively improve initial initial v3 sync times, by refactoring `JoinedRoomsTracker`, from ~47s to <1s. - Add `SlidingSyncUntil...` in tests to reduce races. - Tweak the API shape of JoinedUsersForRoom to reduce state block processing time for large rooms from 63s to 39s. - Add trace task for initial syncs. - Include the proxy version in UA strings. - HTTP errors now wait 1s before returning to stop clients tight-looping on error. - Pending event buffer is now 2000. - Index the room ID first to cull the most events when returning timeline entries. Speeds up `SelectLatestEventsBetween` by a factor of 8. - Remove cancelled `m.room_key_requests` from the to-device inbox. Cuts down the amount of events in the inbox by ~94% for very large (20k+) inboxes, ~50% for moderate sized (200 events) inboxes. Adds book-keeping to remember the unacked to-device position for each client.
2022-12-14 18:53:55 +00:00
) {
2022-08-16 14:23:05 +01:00
d := ctx.Value(ctxData)
if d == nil {
return
}
da := d.(*data)
da.since = since
da.next = next
da.numRooms = numRooms
da.txnID = txnID
2022-09-08 09:42:16 +01:00
da.numToDeviceEvents = numToDeviceEvents
da.numGlobalAccountData = numGlobalAccountData
add extensions for typing and receipts; bugfixes and additional perf improvements Features: - Add `typing` extension. - Add `receipts` extension. - Add comprehensive prometheus `/metrics` activated via `SYNCV3_PROM`. - Add `SYNCV3_PPROF` support. - Add `by_notification_level` sort order. - Add `include_old_rooms` support. - Add support for `$ME` and `$LAZY`. - Add correct filtering when `*,*` is used as `required_state`. - Add `num_live` to each room response to indicate how many timeline entries are live. Bug fixes: - Use a stricter comparison function on ranges: fixes an issue whereby UTs fail on go1.19 due to change in sorting algorithm. - Send back an `errcode` on HTTP errors (e.g expired sessions). - Remove `unsigned.txn_id` on insertion into the DB. Otherwise other users would see other users txn IDs :( - Improve range delta algorithm: previously it didn't handle cases like `[0,20] -> [20,30]` and would panic. - Send HTTP 400 for invalid range requests. - Don't publish no-op unread counts which just adds extra noise. - Fix leaking DB connections which could eventually consume all available connections. - Ensure we always unblock WaitUntilInitialSync even on invalid access tokens. Other code relies on WaitUntilInitialSync() actually returning at _some_ point e.g on startup we have N workers which bound the number of concurrent pollers made at any one time, we need to not just hog a worker forever. Improvements: - Greatly improve startup times of sync3 handlers by improving `JoinedRoomsTracker`: a modest amount of data would take ~28s to create the handler, now it takes 4s. - Massively improve initial initial v3 sync times, by refactoring `JoinedRoomsTracker`, from ~47s to <1s. - Add `SlidingSyncUntil...` in tests to reduce races. - Tweak the API shape of JoinedUsersForRoom to reduce state block processing time for large rooms from 63s to 39s. - Add trace task for initial syncs. - Include the proxy version in UA strings. - HTTP errors now wait 1s before returning to stop clients tight-looping on error. - Pending event buffer is now 2000. - Index the room ID first to cull the most events when returning timeline entries. Speeds up `SelectLatestEventsBetween` by a factor of 8. - Remove cancelled `m.room_key_requests` from the to-device inbox. Cuts down the amount of events in the inbox by ~94% for very large (20k+) inboxes, ~50% for moderate sized (200 events) inboxes. Adds book-keeping to remember the unacked to-device position for each client.
2022-12-14 18:53:55 +00:00
da.numChangedDevices = numChangedDevices
da.numLeftDevices = numLeftDevices
2023-07-31 10:24:36 +01:00
da.connID = connID
da.numLists = numLists
da.roomSubs = roomSubs
da.roomUnsubs = roomUnsubs
2022-08-16 14:23:05 +01:00
}
func DecorateLogger(ctx context.Context, l *zerolog.Event) *zerolog.Event {
d := ctx.Value(ctxData)
if d == nil {
return l
}
da := d.(*data)
// NOTE: These log fields are documented in `docs/log_format.md`: remember to
// keep that document up-to-date.
2022-08-16 14:23:05 +01:00
if da.userID != "" {
l = l.Str("u", da.userID)
}
2023-06-23 17:35:39 +01:00
if da.deviceID != "" {
l = l.Str("dev", da.deviceID)
}
2022-08-16 14:23:05 +01:00
if da.since >= 0 {
l = l.Int64("p", da.since)
}
if da.next >= 0 {
l = l.Int64("q", da.next)
}
if da.txnID != "" {
l = l.Str("t", da.txnID)
}
if da.numRooms >= 0 {
l = l.Int("r", da.numRooms)
}
2022-09-08 09:42:16 +01:00
if da.numToDeviceEvents > 0 {
l = l.Int("d", da.numToDeviceEvents)
}
if da.numGlobalAccountData > 0 {
l = l.Int("ag", da.numGlobalAccountData)
}
add extensions for typing and receipts; bugfixes and additional perf improvements Features: - Add `typing` extension. - Add `receipts` extension. - Add comprehensive prometheus `/metrics` activated via `SYNCV3_PROM`. - Add `SYNCV3_PPROF` support. - Add `by_notification_level` sort order. - Add `include_old_rooms` support. - Add support for `$ME` and `$LAZY`. - Add correct filtering when `*,*` is used as `required_state`. - Add `num_live` to each room response to indicate how many timeline entries are live. Bug fixes: - Use a stricter comparison function on ranges: fixes an issue whereby UTs fail on go1.19 due to change in sorting algorithm. - Send back an `errcode` on HTTP errors (e.g expired sessions). - Remove `unsigned.txn_id` on insertion into the DB. Otherwise other users would see other users txn IDs :( - Improve range delta algorithm: previously it didn't handle cases like `[0,20] -> [20,30]` and would panic. - Send HTTP 400 for invalid range requests. - Don't publish no-op unread counts which just adds extra noise. - Fix leaking DB connections which could eventually consume all available connections. - Ensure we always unblock WaitUntilInitialSync even on invalid access tokens. Other code relies on WaitUntilInitialSync() actually returning at _some_ point e.g on startup we have N workers which bound the number of concurrent pollers made at any one time, we need to not just hog a worker forever. Improvements: - Greatly improve startup times of sync3 handlers by improving `JoinedRoomsTracker`: a modest amount of data would take ~28s to create the handler, now it takes 4s. - Massively improve initial initial v3 sync times, by refactoring `JoinedRoomsTracker`, from ~47s to <1s. - Add `SlidingSyncUntil...` in tests to reduce races. - Tweak the API shape of JoinedUsersForRoom to reduce state block processing time for large rooms from 63s to 39s. - Add trace task for initial syncs. - Include the proxy version in UA strings. - HTTP errors now wait 1s before returning to stop clients tight-looping on error. - Pending event buffer is now 2000. - Index the room ID first to cull the most events when returning timeline entries. Speeds up `SelectLatestEventsBetween` by a factor of 8. - Remove cancelled `m.room_key_requests` from the to-device inbox. Cuts down the amount of events in the inbox by ~94% for very large (20k+) inboxes, ~50% for moderate sized (200 events) inboxes. Adds book-keeping to remember the unacked to-device position for each client.
2022-12-14 18:53:55 +00:00
if da.numChangedDevices > 0 {
l = l.Int("dl-c", da.numChangedDevices)
}
if da.numLeftDevices > 0 {
l = l.Int("dl-l", da.numLeftDevices)
}
2023-07-24 14:42:02 +01:00
if da.bufferSummary != "" {
l = l.Str("b", da.bufferSummary)
}
2023-07-31 10:24:36 +01:00
if da.roomSubs > 0 {
l = l.Int("sub", da.roomSubs)
}
if da.roomUnsubs > 0 {
l = l.Int("usub", da.roomUnsubs)
}
if da.numLists > 0 {
l = l.Int("l", da.numLists)
}
// always log the connection ID so we know when it isn't set
l = l.Str("c", da.connID)
2022-08-16 14:23:05 +01:00
return l
}
func SetRequestContextSetupDuration(ctx context.Context, setup time.Duration) {
d := ctx.Value(ctxData)
if d == nil {
return
}
da := d.(*data)
da.setupTime = setup
}
func SetRequestContextProcessingDuration(ctx context.Context, processing time.Duration) {
d := ctx.Value(ctxData)
if d == nil {
return
}
da := d.(*data)
da.processingTime = processing
}
func RequestContextDurations(ctx context.Context) (setup time.Duration, processing time.Duration) {
d := ctx.Value(ctxData)
if d == nil {
return 0, 0
}
da := d.(*data)
return da.setupTime, da.processingTime
}