Use BinaryHeap for more efficient retry selection

cetra3 2023-07-20 18:52:41 +09:30
parent ea2f6b4f69
commit b2e45f8287
4 changed files with 124 additions and 71 deletions
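The core of the change: RetryRawActivity gets an Ord implementation that compares last_sent in reverse, so Rust's BinaryHeap (a max-heap) behaves as a min-heap and always pops the oldest-sent activity first. A minimal standalone sketch of that ordering trick, using an illustrative Pending type rather than the crate's own structs:

use std::{cmp::Ordering, collections::BinaryHeap, time::{Duration, Instant}};

// Illustrative stand-in for RetryRawActivity: only the ordering matters here.
#[derive(Debug, PartialEq, Eq)]
struct Pending {
    last_sent: Instant,
}

impl Ord for Pending {
    fn cmp(&self, other: &Self) -> Ordering {
        // Reverse the natural Instant order so the earliest last_sent compares
        // as "greatest" and is therefore the first element popped from the heap.
        self.last_sent.cmp(&other.last_sent).reverse()
    }
}

impl PartialOrd for Pending {
    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
        Some(self.cmp(other))
    }
}

fn main() {
    let now = Instant::now();
    let hour_ago = now - Duration::from_secs(3600);

    let mut heap = BinaryHeap::new();
    heap.push(Pending { last_sent: now });
    heap.push(Pending { last_sent: hour_ago });

    // The activity sent an hour ago comes out before the one sent just now.
    assert_eq!(heap.pop().unwrap().last_sent, hour_ago);
    assert_eq!(heap.pop().unwrap().last_sent, now);
}

The retry worker below keeps one such heap per retry count (in a BTreeMap), so a flush can stop scanning a heap as soon as it pops an entry that is still too young.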

View file

@@ -2,7 +2,7 @@
//!
#![doc = include_str!("../../docs/09_sending_activities.md")]
use self::{request::sign_and_send, retry_queue::RetryQueue};
use self::{queue::ActivityQueue, request::sign_and_send};
use crate::{
config::Data,
traits::{ActivityHandler, Actor},
@@ -22,16 +22,15 @@ use std::{
use tracing::{debug, info, warn};
use url::Url;
pub(crate) mod queue;
pub(crate) mod request;
pub(crate) mod retry_queue;
pub(super) mod retry_worker;
pub(super) mod util;
/// Send a new activity to the given inboxes
///
/// - `activity`: The activity to be sent, gets converted to json
/// - `private_key`: Private key belonging to the actor who sends the activity, for signing HTTP
/// signature. Generated with [crate::http_signatures::generate_actor_keypair].
/// - `actor`: The actor doing the sending
/// - `inboxes`: List of remote actor inboxes that should receive the activity. Ignores local actor
/// inboxes. Should be built by calling [crate::traits::Actor::shared_inbox_or_inbox]
/// for each target actor.
@@ -96,6 +95,17 @@ pub struct RawActivity {
private_key: PKey<Private>,
}
impl PartialEq for RawActivity {
fn eq(&self, other: &Self) -> bool {
self.actor_id == other.actor_id
&& self.activity_id == other.activity_id
&& self.activity == other.activity
&& self.inbox == other.inbox
}
}
impl Eq for RawActivity {}
impl RawActivity {
/// Sends a raw activity directly, rather than using the background queue.
/// This will sign and send the request using the configured [`client`](crate::config::FederationConfigBuilder::client) in the federation config
@@ -187,8 +197,8 @@ pub(crate) fn create_activity_queue(
disable_retry: bool,
request_timeout: Duration,
http_signature_compat: bool,
) -> RetryQueue {
RetryQueue::new(
) -> ActivityQueue {
ActivityQueue::new(
client,
worker_count,
retry_count,
@@ -264,7 +274,7 @@ mod tests {
.init();
*/
let activity_queue = RetryQueue::new(
let activity_queue = ActivityQueue::new(
reqwest::Client::default().into(),
num_workers,
num_workers,

View file

@@ -15,7 +15,7 @@ use tokio::{sync::mpsc::UnboundedSender, task::JoinHandle};
/// A simple activity queue which spawns tokio workers to send out requests
/// Uses an unbounded mpsc queue for communication (i.e., all messages are in memory)
pub(crate) struct RetryQueue {
pub(crate) struct ActivityQueue {
// Stats shared between the queue and workers
stats: Arc<Stats>,
sender: UnboundedSender<RetryRawActivity>,
@@ -48,7 +48,7 @@ impl Debug for Stats {
}
}
impl RetryQueue {
impl ActivityQueue {
pub fn new(
client: ClientWithMiddleware,
worker_count: usize,

View file

@@ -1,23 +1,18 @@
use super::{request::sign_and_send, retry_queue::Stats, util::RetryStrategy, RawActivity};
use super::{queue::Stats, request::sign_and_send, util::RetryStrategy, RawActivity};
use futures_core::Future;
use futures_util::FutureExt;
use reqwest_middleware::ClientWithMiddleware;
use std::{
collections::{BTreeMap, BinaryHeap},
sync::{atomic::Ordering, Arc},
time::{Duration, Instant},
};
use tokio::{
sync::mpsc::{
error::TryRecvError,
unbounded_channel,
UnboundedReceiver,
UnboundedSender,
WeakUnboundedSender,
},
sync::mpsc::{unbounded_channel, UnboundedReceiver, UnboundedSender, WeakUnboundedSender},
task::{JoinHandle, JoinSet},
time::MissedTickBehavior,
};
use tracing::error;
use tracing::{error, info};
/// A tokio spawned worker which is responsible for submitting requests to federated servers
/// This will retry up to one time with the same signature, and if it fails, will move it to the retry queue.
@@ -37,7 +32,7 @@ pub(super) struct RetryWorker {
}
/// A message that has been attempted but has not yet been sent successfully
#[derive(Debug)]
#[derive(Debug, PartialEq, Eq)]
pub(super) struct RetryRawActivity {
/// The message that is sent
pub message: RawActivity,
@@ -47,6 +42,20 @@ pub(super) struct RetryRawActivity {
pub count: usize,
}
// We reverse the order here as we want the "highest" to be the earliest, not the latest,
// so that the oldest-sent activity is retried first
impl Ord for RetryRawActivity {
fn cmp(&self, other: &Self) -> std::cmp::Ordering {
self.last_sent.cmp(&other.last_sent).reverse()
}
}
impl PartialOrd for RetryRawActivity {
fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
Some(self.cmp(other))
}
}
impl RetryWorker {
/// Spawns a background task for managing the queue of retryables
pub fn spawn(
@@ -60,7 +69,7 @@
) -> (UnboundedSender<RetryRawActivity>, JoinHandle<()>) {
// The main sender channel, gets called immediately when something is queued
let (sender, receiver) = unbounded_channel::<RetryRawActivity>();
// The batch sender channel, waits up to an hour before checking if anything needs to be sent
// The batch sender channel, checks every hour if anything needs to be sent
let (batch_sender, batch_receiver) = unbounded_channel::<RetryRawActivity>();
// The retry sender channel is fed by the batch loop
let (retry_sender, retry_receiver) = unbounded_channel::<RetryRawActivity>();
@@ -69,13 +78,11 @@
client,
timeout,
stats,
batch_sender: batch_sender.clone().downgrade(),
batch_sender: batch_sender.downgrade(),
backoff,
http_signature_compat,
});
let loop_batch_sender = batch_sender.clone().downgrade();
let retry_task = tokio::spawn(async move {
// This is the main worker queue, tasks sent here are sent immediately
let main_worker = worker.clone();
@@ -90,12 +97,7 @@
if let Some(retry_count) = retry_count {
// This task checks every hour for anything that needs to be sent, based upon the last sent time
// If any tasks need to be sent, they are then sent to the retry queue
let batch_loop = retry_loop(
backoff.pow(2),
batch_receiver,
loop_batch_sender,
retry_sender,
);
let batch_loop = retry_loop(backoff.pow(2), batch_receiver, retry_sender);
let retry_queue = receiver_queue(retry_count, retry_receiver, move |message| {
let worker = worker.clone();
@@ -179,60 +181,101 @@
}
}
/// Ordered list of raw activities based upon retry count
///
/// Uses separate binary heaps per count to keep things in order
///
/// When flushed it will go through each queue and check to see if there are any retries ready to be sent
///
/// If enough time has elapsed, they are sent via the sender; otherwise they stay in the queue
struct RetryQueue {
/// Queue per retry count for ordering
queues: BTreeMap<usize, BinaryHeap<RetryRawActivity>>,
sender: UnboundedSender<RetryRawActivity>,
sleep_interval: usize,
}
impl RetryQueue {
/// Push a raw activity onto the queue
fn push(&mut self, retry: RetryRawActivity) {
let queue = self.queues.entry(retry.count).or_default();
queue.push(retry);
}
/// Flush out & send any retries that are due to be sent
fn flush(&mut self) {
let mut count = 0;
let mut total = 0;
// We check each queue separately
for (retry_count, queue) in self.queues.iter_mut() {
// We check the duration based on the retry count using an exponential backoff, i.e., 60s, 60m, 60h
let sleep_duration =
Duration::from_secs(self.sleep_interval.pow(*retry_count as u32) as u64);
total += queue.len();
'queue: loop {
match queue.pop() {
Some(retry) => {
// If the elapsed time is long enough we send it
if retry.last_sent.elapsed() > sleep_duration {
if let Err(err) = self.sender.send(retry) {
error!("Error sending retry: {err}");
}
count += 1;
// If it's too young, then we exit the loop: the heap yields entries in
// ascending last_sent order, so nothing after this one can be old enough
} else {
queue.push(retry);
break 'queue;
}
}
None => break 'queue,
}
}
}
if total > 0 {
info!("Scheduled {count}/{total} activities for retry");
}
}
}
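For reference, the "60s, 60m, 60h" figures in the flush comment above come from sleep_interval.pow(retry_count); assuming a base interval of 60 seconds (an assumption taken from that comment, not from the surrounding wiring), retry counts 1 through 3 work out as follows:

fn main() {
    // Assumed base interval of 60 seconds, per the "60s, 60m, 60h" comment in flush()
    let sleep_interval: u64 = 60;
    for retry_count in 1..=3u32 {
        println!("retry count {retry_count}: {}s", sleep_interval.pow(retry_count));
    }
    // prints 60s, 3600s (60 minutes) and 216000s (60 hours)
}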
/// This is a retry loop that will simply send tasks in batches
/// It will check an incoming queue, and schedule any tasks that need to be sent
/// The current sleep interval here is 1 hour
async fn retry_loop(
sleep_interval: usize,
mut batch_receiver: UnboundedReceiver<RetryRawActivity>,
batch_sender: WeakUnboundedSender<RetryRawActivity>,
retry_sender: UnboundedSender<RetryRawActivity>,
) {
let mut interval = tokio::time::interval(Duration::from_secs((sleep_interval) as u64));
interval.set_missed_tick_behavior(MissedTickBehavior::Delay);
let mut inner = RetryQueue {
queues: Default::default(),
sender: retry_sender,
sleep_interval,
};
loop {
interval.tick().await;
// We requeue any messages to be checked next time if they haven't slept long enough yet
let mut requeue_messages = Vec::new();
// Grab all the activities that are in the queue
loop {
// try_recv will not await anything
match batch_receiver.try_recv() {
Ok(message) => {
let sleep_duration = Duration::from_secs(
sleep_interval.pow(message.count as u32) as u64,
// Take off 1 second for tests to pass
) - Duration::from_secs(1);
// If the time between now and sending this message is greater than our sleep duration
if message.last_sent.elapsed() > sleep_duration {
if let Err(err) = retry_sender.send(message) {
error!("Couldn't wake up task for sending: {err}");
}
} else {
// If we haven't slept long enough, then we just add it to the end of the queue
requeue_messages.push(message);
tokio::select! {
message = batch_receiver.recv() => {
match message {
// We have a new message, add it to our queue
Some(retry) => {
inner.push(retry);
},
// The receiver has dropped, so flush out everything and then exit the loop
None => {
inner.flush();
break;
}
}
Err(TryRecvError::Empty) => {
// no more to be had, break and wait for the next interval
break;
}
Err(TryRecvError::Disconnected) => {
return;
}
}
}
// If there are any messages that need to be retried later on
if let Some(ref sender) = batch_sender.upgrade() {
for message in requeue_messages {
if let Err(err) = sender.send(message) {
error!("Couldn't wake up task for sending: {err}");
}
_ = interval.tick() => {
inner.flush();
}
}
}
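The rewritten retry_loop above swaps the old try_recv draining pass for a tokio::select! over the incoming channel and the hourly interval tick. A minimal sketch of that general shape, with illustrative names and assuming tokio's rt, macros, sync and time features (a buffer drained on each tick, with a final flush once the sender side is dropped):

use std::time::Duration;
use tokio::{sync::mpsc::unbounded_channel, time::MissedTickBehavior};

#[tokio::main]
async fn main() {
    let (tx, mut rx) = unbounded_channel::<u32>();
    let mut interval = tokio::time::interval(Duration::from_millis(100));
    interval.set_missed_tick_behavior(MissedTickBehavior::Delay);

    tx.send(1).unwrap();
    tx.send(2).unwrap();
    drop(tx); // dropping the only sender ends the loop below

    let mut buffered = Vec::new();
    loop {
        tokio::select! {
            message = rx.recv() => {
                match message {
                    // Buffer new messages until the next tick
                    Some(value) => buffered.push(value),
                    // The sender side has dropped: flush whatever is left and stop
                    None => {
                        println!("final flush: {buffered:?}");
                        break;
                    }
                }
            }
            _ = interval.tick() => {
                println!("tick flush: {buffered:?}");
                buffered.clear();
            }
        }
    }
}

MissedTickBehavior::Delay keeps consecutive ticks spaced a full interval apart if a flush overruns, rather than firing a burst of catch-up ticks.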

View file

@@ -16,7 +16,7 @@
//! ```
use crate::{
activity_queue::{create_activity_queue, retry_queue::RetryQueue},
activity_queue::{create_activity_queue, queue::ActivityQueue},
error::Error,
protocol::verification::verify_domains_match,
traits::{ActivityHandler, Actor},
@@ -98,7 +98,7 @@ pub struct FederationConfig<T: Clone> {
/// Queue for sending outgoing activities. Only optional to make the builder work; it's always
/// present once constructed.
#[builder(setter(skip))]
pub(crate) activity_queue: Option<Arc<RetryQueue>>,
pub(crate) activity_queue: Option<Arc<ActivityQueue>>,
}
impl<T: Clone> FederationConfig<T> {
@@ -199,7 +199,7 @@ impl<T: Clone> FederationConfig<T> {
.take()
.context("ActivityQueue never constructed, build() not called?")?;
// Todo: use Arc::into_inner, but it is only available from Rust 1.70.
let stats = Arc::<RetryQueue>::try_unwrap(q)
let stats = Arc::<ActivityQueue>::try_unwrap(q)
.map_err(|_| {
anyhow::anyhow!(
"Could not cleanly shut down: activityqueue arc was still in use elsewhere "