Convert titles for posts from Mastodon to plaintext (fixes #3828) (#4033)

* Convert titles for posts from Mastodon to plaintext (fixes #3828)

* Fix prettier.

* Trigger build

* Convert titles for posts from Mastodon to plaintext (fixes #3828)

* Fix prettier.

* Fix sanitize.

---------

Co-authored-by: Dessalines <dessalines@users.noreply.github.com>
Co-authored-by: Dessalines <tyhou13@gmx.com>
This commit is contained in:
Nutomic 2023-10-13 02:36:02 +02:00 committed by GitHub
parent 645bf21d54
commit 608bb6b1b4
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
6 changed files with 77 additions and 27 deletions

8
Cargo.lock generated
View file

@ -2719,6 +2719,7 @@ dependencies = [
"enum_delegate",
"futures",
"html2md",
"html2text",
"http",
"itertools 0.11.0",
"lemmy_api_common",
@ -2734,6 +2735,7 @@ dependencies = [
"serde_json",
"serde_with",
"serial_test",
"stringreader",
"strum_macros",
"task-local-extensions",
"tokio",
@ -4950,6 +4952,12 @@ dependencies = [
"unicode-normalization",
]
[[package]]
name = "stringreader"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "913e7b03d63752f6cdd2df77da36749d82669904798fe8944b9ec3d23f159905"
[[package]]
name = "strsim"
version = "0.10.0"

View file

@ -38,6 +38,8 @@ anyhow = { workspace = true }
reqwest = { workspace = true }
once_cell = { workspace = true }
html2md = "0.2.14"
html2text = "0.6.0"
stringreader = "0.1.1"
serde_with = { workspace = true }
enum_delegate = "0.2.0"
moka = { version = "0.11", features = ["future"] }

View file

@ -1,7 +1,7 @@
{
"id": "https://enterprise.lemmy.ml/c/tenforward",
"type": "Group",
"preferredUsername": "main",
"preferredUsername": "tenforward",
"name": "Ten Forward",
"summary": "<p>Lounge and recreation facility</p>\n<hr />\n<p>Welcome to the <a href=\"https://memory-alpha.fandom.com/wiki/USS_Enterprise_(NCC-1701-D)\">Enterprise</a>!.</p>\n",
"source": {

View file

@ -11,40 +11,42 @@
"votersCount": "toot:votersCount"
}
],
"id": "https://mastodon.madrid/users/felix/statuses/107224289116410645",
"id": "https://dice.camp/users/thekernelinyellow/statuses/110830743680706519",
"type": "Note",
"summary": null,
"published": "2021-11-05T11:46:50Z",
"url": "https://mastodon.madrid/@felix/107224289116410645",
"attributedTo": "https://mastodon.madrid/users/felix",
"to": ["https://mastodon.madrid/users/felix/followers"],
"inReplyTo": null,
"published": "2023-08-04T09:55:39Z",
"url": "https://dice.camp/@thekernelinyellow/110830743680706519",
"attributedTo": "https://dice.camp/users/thekernelinyellow",
"to": ["https://www.w3.org/ns/activitystreams#Public"],
"cc": [
"https://www.w3.org/ns/activitystreams#Public",
"https://mamot.fr/users/retiolus"
"https://dice.camp/users/thekernelinyellow/followers",
"https://enterprise.lemmy.ml/c/tenforward",
"https://enterprise.lemmy.ml/c/tenforward/followers"
],
"sensitive": false,
"atomUri": "https://mastodon.madrid/users/felix/statuses/107224289116410645",
"inReplyToAtomUri": "https://mamot.fr/users/retiolus/statuses/107224244380204526",
"conversation": "tag:mamot.fr,2021-11-05:objectId=64635960:objectType=Conversation",
"content": "<p><span class=\"h-card\"><a href=\"https://mamot.fr/@retiolus\" class=\"u-url mention\">@<span>retiolus</span></a></span> i have never been disappointed by a thinkpad. if you want to save money, get a model from a few years ago, there isnt a huge difference anyway.</p>",
"atomUri": "https://dice.camp/users/thekernelinyellow/statuses/110830743680706519",
"inReplyToAtomUri": null,
"conversation": "tag:dice.camp,2023-08-04:objectId=29969291:objectType=Conversation",
"content": "<p><span class=\"h-card\" translate=\"no\"><a href=\"https://enterprise.lemmy.ml/c/tenforward\" class=\"u-url mention\">@<span>tenforward</span></a></span> Variable never resetting at refresh</p><p>Hi! I&#39;m using a variable to count elements in my generator but every time I generate a new character, the counter&#39;s value carries on from the previous one. Is there a function to reset it (I set it to 0 at the beginning of the file)</p>",
"contentMap": {
"en": "<p><span class=\"h-card\"><a href=\"https://mamot.fr/@retiolus\" class=\"u-url mention\">@<span>retiolus</span></a></span> i have never been disappointed by a thinkpad. if you want to save money, get a model from a few years ago, there isnt a huge difference anyway.</p>"
"it": "<p><span class=\"h-card\" translate=\"no\"><a href=\"https://enterprise.lemmy.ml/c/tenforward\" class=\"u-url mention\">@<span>tenforward</span></a></span>Variable never resetting at refresh</p><p>Hi! I&#39;m using a variable to count elements in my generator but every time I generate a new character, the counter&#39;s value carries on from the previous one. Is there a function to reset it (I set it to 0 at the beginning of the file)</p>"
},
"attachment": [],
"tag": [
{
"type": "Mention",
"href": "https://mamot.fr/users/retiolus",
"name": "@retiolus@mamot.fr"
"href": "https://enterprise.lemmy.ml/c/tenforward",
"name": "@tenforward@enterprise.lemmy.ml"
}
],
"replies": {
"id": "https://mastodon.madrid/users/felix/statuses/107224289116410645/replies",
"id": "https://dice.camp/users/thekernelinyellow/statuses/110830743680706519/replies",
"type": "Collection",
"first": {
"type": "CollectionPage",
"next": "https://mastodon.madrid/users/felix/statuses/107224289116410645/replies?only_other_accounts=true&page=true",
"partOf": "https://mastodon.madrid/users/felix/statuses/107224289116410645/replies",
"next": "https://dice.camp/users/thekernelinyellow/statuses/110830743680706519/replies?only_other_accounts=true&page=true",
"partOf": "https://dice.camp/users/thekernelinyellow/statuses/110830743680706519/replies",
"items": []
}
}

View file

@ -7,7 +7,7 @@ use crate::objects::{
use activitypub_federation::{config::Data, fetch::object_id::ObjectId};
use actix_web::web::Json;
use futures::{future::try_join_all, StreamExt};
use lemmy_api_common::{context::LemmyContext, utils::sanitize_html_api_opt, SuccessResponse};
use lemmy_api_common::{context::LemmyContext, SuccessResponse};
use lemmy_db_schema::{
newtypes::DbUrl,
source::{
@ -20,6 +20,7 @@ use lemmy_db_schema::{
post::{PostSaved, PostSavedForm},
},
traits::{Blockable, Crud, Followable, Saveable},
utils::diesel_option_overwrite,
};
use lemmy_db_views::structs::LocalUserView;
use lemmy_utils::{
@ -96,8 +97,8 @@ pub async fn import_settings(
local_user_view: LocalUserView,
context: Data<LemmyContext>,
) -> Result<Json<SuccessResponse>, LemmyError> {
let display_name = Some(sanitize_html_api_opt(&data.display_name));
let bio = Some(sanitize_html_api_opt(&data.bio));
let display_name = diesel_option_overwrite(data.display_name.clone());
let bio = diesel_option_overwrite(data.bio.clone());
let person_form = PersonUpdateForm {
display_name,

View file

@ -21,7 +21,7 @@ use activitypub_federation::{
};
use anyhow::anyhow;
use chrono::{DateTime, Utc};
use html2md::parse_html;
use html2text::{from_read_with_decorator, render::text_renderer::TrivialDecorator};
use lemmy_api_common::{
context::LemmyContext,
request::fetch_site_data,
@ -48,6 +48,7 @@ use lemmy_utils::{
},
};
use std::ops::Deref;
use stringreader::StringReader;
use url::Url;
const MAX_TITLE_LENGTH: usize = 200;
@ -171,11 +172,21 @@ impl Object for ApubPost {
.name
.clone()
.or_else(|| {
// Posts coming from Mastodon or similar platforms don't have a title. Instead we take the
// first line of the content and convert it from HTML to plaintext. We also remove mentions
// of the community name.
page
.content
.clone()
.as_ref()
.and_then(|c| parse_html(c).lines().next().map(ToString::to_string))
.as_deref()
.map(StringReader::new)
.map(|c| from_read_with_decorator(c, MAX_TITLE_LENGTH, TrivialDecorator::new()))
.and_then(|c| {
c.lines().next().map(|s| {
s.replace(&format!("@{}", community.name), "")
.trim()
.to_string()
})
})
})
.ok_or_else(|| anyhow!("Object must have name or content"))?;
if name.chars().count() > MAX_TITLE_LENGTH {
@ -288,8 +299,9 @@ mod tests {
use super::*;
use crate::{
objects::{
community::tests::parse_lemmy_community,
person::tests::parse_lemmy_person,
community::{tests::parse_lemmy_community, ApubCommunity},
instance::ApubSite,
person::{tests::parse_lemmy_person, ApubPerson},
post::ApubPost,
tests::init_context,
},
@ -318,6 +330,31 @@ mod tests {
assert!(!post.featured_community);
assert_eq!(context.request_count(), 0);
cleanup(&context, person, site, community, post).await;
}
/// Posts from Mastodon carry no title, so the post name is derived from the content.
/// Checks the name produced for the Mastodon page fixture.
#[tokio::test]
#[serial]
async fn test_convert_mastodon_post_title() {
  let ctx = init_context().await;

  // Set up the person (with its site) and the community, in that order.
  let (author, instance) = parse_lemmy_person(&ctx).await;
  let group = parse_lemmy_community(&ctx).await;

  // Load the fixture and convert it into a post.
  let note = file_to_json_object("assets/mastodon/objects/page.json").unwrap();
  let converted = ApubPost::from_json(note, &ctx).await.unwrap();

  // The name must be plain text: no HTML markup and no community mention.
  assert_eq!(converted.name, "Variable never resetting at refresh");

  // Hand everything created above to the shared cleanup helper.
  cleanup(&ctx, author, instance, group, converted).await;
}
async fn cleanup(
context: &Data<LemmyContext>,
person: ApubPerson,
site: ApubSite,
community: ApubCommunity,
post: ApubPost,
) {
Post::delete(&mut context.pool(), post.id).await.unwrap();
Person::delete(&mut context.pool(), person.id)
.await