From 28324ad2c8e3c08f57fd3ae1af14750cf695371c Mon Sep 17 00:00:00 2001
From: Lukas Trombach
Date: Wed, 23 Aug 2023 02:30:15 +1200
Subject: [PATCH] Sitemap (#3808)

* generate sitemap.xml file

* set up endpoint for sitemap

* Update sitemap generation

- remove sitemap generation from scheduled tasks
- add posts query for sitemap
- create sitemap module in API crate

* remove priority and change freq from sitemap

* add configuration option for number of posts for sitemap

* fix default config

* rate limit sitemap endpoint

* update sitemap query

* update sitemap generation

- remove config value for query limit
- adjust sitemap generation to query changes
- tidy up error handling

* refactor sitemap generation loop

* remove `limit` argument

* refactor `generate_urlset` and add unit test

* change query to only fetch local posts of past 24h

* fix outdated comment and log

* cargo fmt
---
Review notes (this section is ignored when the patch is applied):
- generate_urlset: `map_while` replaced by `filter_map`, so one entry that fails to build no
  longer drops every post after it from the sitemap.
- Doc comments added to generate_urlset, get_sitemap and Post::list_for_sitemap; the timezone
  TODO is replaced by an explanation. Hunk sizes and the diffstat are adjusted accordingly.
- The blob hashes on the `index` lines predate these edits.

 Cargo.lock                         |  28 ++++++
 crates/api/Cargo.toml              |   3 +
 crates/api/src/lib.rs              |   1 +
 crates/api/src/sitemap.rs          | 150 +++++++++++++++++++++++++++++
 crates/db_schema/src/impls/post.rs |  20 ++++
 src/api_routes_http.rs             |   6 ++
 6 files changed, 208 insertions(+)
 create mode 100644 crates/api/src/sitemap.rs

diff --git a/Cargo.lock b/Cargo.lock
index 98a890959..ba37fcc14 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1603,6 +1603,15 @@ version = "1.8.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "90e5c1c8368803113bf0c9584fc495a58b86dc8a29edbf8fe877d21d9507e797"
 
+[[package]]
+name = "elementtree"
+version = "1.2.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3efd4742acf458718a6456e0adf0b4d734d6b783e452bbf1ac36bf31f4085cb3"
+dependencies = [
+ "string_cache",
+]
+
 [[package]]
 name = "email-encoding"
 version = "0.2.0"
@@ -2581,6 +2590,7 @@ dependencies = [
  "bcrypt",
  "captcha",
  "chrono",
+ "elementtree",
  "lemmy_api_common",
  "lemmy_db_schema",
  "lemmy_db_views",
@@ -2589,8 +2599,10 @@ dependencies = [
  "lemmy_utils",
  "serde",
  "serial_test",
+ "sitemap-rs",
  "tokio",
  "tracing",
+ "url",
  "uuid",
  "wav",
 ]
@@ -4745,6 +4757,16 @@ version = "0.3.10"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "7bd3e3206899af3f8b12af284fafc038cc1dc2b41d1b89dd17297221c5d225de"
 
+[[package]]
+name = "sitemap-rs"
+version = "0.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "95b58125f0ab4317b5ba3cdc1f60696e47958760e356874c759334fa56ae1596"
+dependencies = [
+ "chrono",
+ "xml-builder",
+]
+
 [[package]]
 name = "skeptic"
 version = "0.13.7"
@@ -6132,6 +6154,12 @@ dependencies = [
  "winapi",
 ]
 
+[[package]]
+name = "xml-builder"
+version = "0.5.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "efc4f1a86af7800dfc4056c7833648ea4515ae21502060b5c98114d828f5333b"
+
 [[package]]
 name = "xml5ever"
 version = "0.17.0"
diff --git a/crates/api/Cargo.toml b/crates/api/Cargo.toml
index 17f40d57d..f5b0e3924 100644
--- a/crates/api/Cargo.toml
+++ b/crates/api/Cargo.toml
@@ -31,8 +31,11 @@ captcha = { workspace = true }
 anyhow = { workspace = true }
 tracing = { workspace = true }
 chrono = { workspace = true }
+url = { workspace = true }
 wav = "1.0.0"
+sitemap-rs = "0.2.0"
 
 [dev-dependencies]
 serial_test = { workspace = true }
 tokio = { workspace = true }
+elementtree = "1.2.3"
diff --git a/crates/api/src/lib.rs b/crates/api/src/lib.rs
index 1b7b32154..daec7cbc6 100644
--- a/crates/api/src/lib.rs
+++ b/crates/api/src/lib.rs
@@ -18,6 +18,7 @@ pub mod post_report;
 pub mod private_message;
 pub mod private_message_report;
 pub mod site;
+pub mod sitemap;
 
 #[async_trait::async_trait(?Send)]
 pub trait Perform {
diff --git a/crates/api/src/sitemap.rs b/crates/api/src/sitemap.rs
new file mode 100644
index 000000000..cb47e8e85
--- /dev/null
+++ b/crates/api/src/sitemap.rs
@@ -0,0 +1,150 @@
+use actix_web::{
+  http::header::{self, CacheDirective},
+  web::Data,
+  HttpResponse,
+};
+use chrono::{DateTime, FixedOffset};
+use lemmy_api_common::context::LemmyContext;
+use lemmy_db_schema::{newtypes::DbUrl, source::post::Post};
+use lemmy_utils::error::LemmyResult;
+use sitemap_rs::{url::Url, url_set::UrlSet};
+use tracing::info;
+
+/// Builds the sitemap URL set from `(post URL, last-modified timestamp)` pairs.
+///
+/// An entry whose URL record fails to build is skipped on its own, so it cannot cut off the
+/// entries that follow it.
+async fn generate_urlset(posts: Vec<(DbUrl, chrono::NaiveDateTime)>) -> LemmyResult<UrlSet> {
+  let urls = posts
+    .into_iter()
+    .filter_map(|post| {
+      Url::builder(post.0.to_string())
+        .last_modified(DateTime::from_utc(
+          post.1,
+          // The query compares these timestamps with `Utc::now().naive_utc()`, so they are
+          // interpreted as UTC (offset 0).
+          FixedOffset::east_opt(0).expect("Error setting timezone offset"),
+        ))
+        .build()
+        .ok()
+    })
+    .collect();
+
+  Ok(UrlSet::new(urls)?)
+}
+
+/// Handler for `/sitemap.xml`: renders the posts from `Post::list_for_sitemap` as an XML sitemap
+/// and marks the response as cacheable for 24 hours.
+pub async fn get_sitemap(context: Data<LemmyContext>) -> LemmyResult<HttpResponse> {
+  info!("Generating sitemap with posts from last {} hours...", 24);
+  let posts = Post::list_for_sitemap(&mut context.pool()).await?;
+  info!("Loaded latest {} posts", posts.len());
+
+  let mut buf = Vec::<u8>::new();
+  generate_urlset(posts).await?.write(&mut buf)?;
+
+  Ok(
+    HttpResponse::Ok()
+      .content_type("application/xml")
+      .insert_header(header::CacheControl(vec![CacheDirective::MaxAge(86_400)])) // 24 h
+      .body(buf),
+  )
+}
+
+#[cfg(test)]
+pub(crate) mod tests {
+  #![allow(clippy::unwrap_used)]
+
+  use crate::sitemap::generate_urlset;
+  use chrono::{NaiveDate, NaiveDateTime};
+  use elementtree::Element;
+  use lemmy_db_schema::newtypes::DbUrl;
+  use url::Url;
+
+  #[tokio::test]
+  async fn test_generate_urlset() {
+    let posts: Vec<(DbUrl, NaiveDateTime)> = vec![
+      (
+        Url::parse("https://example.com").unwrap().into(),
+        NaiveDate::from_ymd_opt(2022, 12, 1)
+          .unwrap()
+          .and_hms_opt(9, 10, 11)
+          .unwrap(),
+      ),
+      (
+        Url::parse("https://lemmy.ml").unwrap().into(),
+        NaiveDate::from_ymd_opt(2023, 1, 1)
+          .unwrap()
+          .and_hms_opt(1, 2, 3)
+          .unwrap(),
+      ),
+    ];
+
+    let mut buf = Vec::<u8>::new();
+    generate_urlset(posts)
+      .await
+      .unwrap()
+      .write(&mut buf)
+      .unwrap();
+    let root = Element::from_reader(buf.as_slice()).unwrap();
+
+    assert_eq!(root.tag().name(), "urlset");
+    assert_eq!(root.child_count(), 2);
+
+    assert!(root.children().all(|url| url.tag().name() == "url"));
+    assert!(root.children().all(|url| url.child_count() == 2));
+    assert!(root.children().all(|url| url
+      .children()
+      .next()
+      .is_some_and(|element| element.tag().name() == "loc")));
+    assert!(root.children().all(|url| url
+      .children()
+      .nth(1)
+      .is_some_and(|element| element.tag().name() == "lastmod")));
+
+    assert_eq!(
+      root
+        .children()
+        .next()
+        .unwrap()
+        .children()
+        .find(|element| element.tag().name() == "loc")
+        .unwrap()
+        .text(),
+      "https://example.com/"
+    );
+    assert_eq!(
+      root
+        .children()
+        .next()
+        .unwrap()
+        .children()
+        .find(|element| element.tag().name() == "lastmod")
+        .unwrap()
+        .text(),
+      "2022-12-01T09:10:11+00:00"
+    );
+    assert_eq!(
+      root
+        .children()
+        .nth(1)
+        .unwrap()
+        .children()
+        .find(|element| element.tag().name() == "loc")
+        .unwrap()
+        .text(),
+      "https://lemmy.ml/"
+    );
+    assert_eq!(
+      root
+        .children()
+        .nth(1)
+        .unwrap()
+        .children()
+        .find(|element| element.tag().name() == "lastmod")
+        .unwrap()
+        .text(),
+      "2023-01-01T01:02:03+00:00"
+    );
+  }
+}
diff --git a/crates/db_schema/src/impls/post.rs b/crates/db_schema/src/impls/post.rs
index a798ddcb0..630eac426 100644
--- a/crates/db_schema/src/impls/post.rs
+++ b/crates/db_schema/src/impls/post.rs
@@ -1,3 +1,4 @@
+use super::instance::coalesce;
 use crate::{
   newtypes::{CommunityId, DbUrl, PersonId, PostId},
   schema::post::dsl::{
@@ -7,6 +8,7 @@ use crate::{
     creator_id,
     deleted,
     featured_community,
+    local,
     name,
     post,
     published,
@@ -30,6 +32,7 @@ use crate::{
   utils::{get_conn, naive_now, DbPool, DELETED_REPLACEMENT_TEXT, FETCH_LIMIT_MAX},
 };
 use ::url::Url;
+use chrono::{Duration, Utc};
 use diesel::{dsl::insert_into, result::Error, ExpressionMethods, QueryDsl, TextExpressionMethods};
 use diesel_async::RunQueryDsl;
 
@@ -96,6 +99,23 @@ impl Post {
       .await
   }
 
+  /// Lists `(ap_id, coalesce(updated, published))` for local posts published in the past day
+  /// that are neither deleted nor removed, newest first.
+  pub async fn list_for_sitemap(
+    pool: &mut DbPool<'_>,
+  ) -> Result<Vec<(DbUrl, chrono::NaiveDateTime)>, Error> {
+    let conn = &mut get_conn(pool).await?;
+    post
+      .select((ap_id, coalesce(updated, published)))
+      .filter(local)
+      .filter(deleted.eq(false))
+      .filter(removed.eq(false))
+      .filter(published.ge(Utc::now().naive_utc() - Duration::days(1)))
+      .order(published.desc())
+      .load::<(DbUrl, chrono::NaiveDateTime)>(conn)
+      .await
+  }
+
   pub async fn permadelete_for_creator(
     pool: &mut DbPool<'_>,
     for_creator_id: PersonId,
diff --git a/src/api_routes_http.rs b/src/api_routes_http.rs
index f4a137d9a..5c8360d0d 100644
--- a/src/api_routes_http.rs
+++ b/src/api_routes_http.rs
@@ -16,6 +16,7 @@ use lemmy_api::{
   local_user::{ban_person::ban_from_site, notifications::mark_reply_read::mark_reply_as_read},
   post::{feature::feature_post, like::like_post, lock::lock_post},
   post_report::create::create_post_report,
+  sitemap::get_sitemap,
   Perform,
 };
 use lemmy_api_common::{
@@ -340,6 +341,11 @@ pub fn config(cfg: &mut web::ServiceConfig, rate_limit: &RateLimitCell) {
           .route("/delete", web::post().to(delete_custom_emoji)),
       ),
   );
+  cfg.service(
+    web::scope("/sitemap.xml")
+      .wrap(rate_limit.message())
+      .route("", web::get().to(get_sitemap)),
+  );
 }
 
 async fn perform<'a, Data>(