Skip to content

Commit

Permalink
Implement new lightweight search index for searching messages (#7029)
Browse files Browse the repository at this point in the history
  • Loading branch information
hpeebles authored Dec 10, 2024
1 parent 700fdce commit 9a7b4d0
Show file tree
Hide file tree
Showing 19 changed files with 348 additions and 214 deletions.
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions backend/canisters/community/CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
- Switch to using `PrincipalToStableMemoryMap` ([#7023](https://github.com/open-chat-labs/open-chat/pull/7023))
- Reduce size by grouping member -> channel links per userId ([#7025](https://github.com/open-chat-labs/open-chat/pull/7025))
- Merge member Ids set with channel links map ([#7027](https://github.com/open-chat-labs/open-chat/pull/7027))
- Implement new lightweight search index for searching messages ([#7029](https://github.com/open-chat-labs/open-chat/pull/7029))
- Make `MessageId` comparisons use their 64bit representation ([#7030](https://github.com/open-chat-labs/open-chat/pull/7030))

### Removed
Expand Down
5 changes: 2 additions & 3 deletions backend/canisters/community/impl/src/model/channels.rs
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
use super::members::CommunityMembers;
use chat_events::Reader;
use group_chat_core::{CanLeaveResult, GroupChatCore, GroupMemberInternal, LeaveResult};
use rand::rngs::StdRng;
use rand::{Rng, RngCore};
use search::*;
use search::weighted::*;
use serde::{Deserialize, Serialize};
use std::cmp::{max, Reverse};
use std::collections::hash_map::Entry::Vacant;
Expand All @@ -13,8 +14,6 @@ use types::{
UserId, UserType, MAX_THREADS_IN_SUMMARY,
};

use super::members::CommunityMembers;

#[derive(Serialize, Deserialize, Default)]
pub struct Channels {
channels: HashMap<ChannelId, Channel>,
Expand Down
2 changes: 1 addition & 1 deletion backend/canisters/community/impl/src/model/members.rs
Original file line number Diff line number Diff line change
Expand Up @@ -28,9 +28,9 @@ pub struct CommunityMembers {
members_and_channels: BTreeMap<UserId, Vec<ChannelId>>,
member_channel_links_removed: BTreeMap<(UserId, ChannelId), TimestampMillis>,
user_groups: UserGroups,
// This includes the userIds of community members and also users invited to the community
#[serde(deserialize_with = "deserialize_principal_to_user_id_map_from_heap")]
principal_to_user_id_map: PrincipalToUserIdMap,
// This includes the userIds of community members and also users invited to the community
#[deprecated]
member_ids: Vec<UserId>,
owners: BTreeSet<UserId>,
Expand Down
1 change: 1 addition & 0 deletions backend/canisters/group/CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
- Ensure bot has permission to execute given action ([#7014](https://github.com/open-chat-labs/open-chat/pull/7014))
- Allow bots to send a subset of message types ([#7016](https://github.com/open-chat-labs/open-chat/pull/7016))
- Switch to using `PrincipalToStableMemoryMap` ([#7023](https://github.com/open-chat-labs/open-chat/pull/7023))
- Implement new lightweight search index for searching messages ([#7029](https://github.com/open-chat-labs/open-chat/pull/7029))
- Make `MessageId` comparisons use their 64bit representation ([#7030](https://github.com/open-chat-labs/open-chat/pull/7030))

## [[2.0.1501](https://github.com/open-chat-labs/open-chat/releases/tag/v2.0.1501-group)] - 2024-12-06
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
use crate::model::moderation_flags::ModerationFlags;
use crate::model::private_communities::PrivateCommunityInfo;
use crate::MARK_ACTIVE_DURATION;
use search::{Document, Query};
use search::weighted::{Document, Query};
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use types::{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ use crate::model::cached_hot_groups::CachedPublicGroupSummary;
use crate::model::private_groups::PrivateGroupInfo;
use crate::{CACHED_HOT_GROUPS_COUNT, MARK_ACTIVE_DURATION};
use constants::DAY_IN_MS;
use search::*;
use search::weighted::*;
use serde::{Deserialize, Serialize};
use std::cmp;
use std::collections::HashMap;
Expand Down
1 change: 1 addition & 0 deletions backend/canisters/user/CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
- Expose size of each virtual stable memory in metrics ([#6981](https://github.com/open-chat-labs/open-chat/pull/6981))
- Include the ledger canister Id in transfer failed error logs ([#7011](https://github.com/open-chat-labs/open-chat/pull/7011))
- Send user's principal to group/community when leaving ([#7023](https://github.com/open-chat-labs/open-chat/pull/7023))
- Implement new lightweight search index for searching messages ([#7029](https://github.com/open-chat-labs/open-chat/pull/7029))
- Make `MessageId` comparisons use their 64bit representation ([#7030](https://github.com/open-chat-labs/open-chat/pull/7030))

### Removed
Expand Down
4 changes: 2 additions & 2 deletions backend/canisters/user/impl/src/queries/search_messages.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
use crate::guards::caller_is_owner;
use crate::{read_state, RuntimeState};
use canister_api_macros::query;
use search::Query;
use search::simple::Query;
use std::collections::HashSet;
use types::MessageIndex;
use user_canister::search_messages::{Response::*, *};
Expand Down Expand Up @@ -30,7 +30,7 @@ fn search_messages_impl(args: Args, state: &RuntimeState) -> Response {
Some(dc) => dc,
};

let query = Query::parse(args.search_term);
let query = Query::new(&args.search_term);

let matches = direct_chat
.events
Expand Down
2 changes: 1 addition & 1 deletion backend/canisters/user_index/impl/src/model/user_map.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ use crate::model::diamond_membership_details::DiamondMembershipDetailsInternal;
use crate::model::user::User;
use crate::DiamondMembershipUserMetrics;
use candid::Principal;
use search::{Document as SearchDocument, Query};
use search::weighted::{Document as SearchDocument, Query};
use serde::{Deserialize, Serialize};
use std::collections::{BTreeMap, BTreeSet, HashMap};
use std::ops::RangeFrom;
Expand Down
2 changes: 1 addition & 1 deletion backend/libraries/chat_events/src/chat_events.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ use constants::{HOUR_IN_MS, ONE_MB, OPENCHAT_BOT_USER_ID};
use event_store_producer::{EventBuilder, EventStoreClient, Runtime};
use rand::rngs::StdRng;
use rand::Rng;
use search::{Document, Query};
use search::simple::{Document, Query};
use serde::{Deserialize, Serialize};
use serde_bytes::ByteBuf;
use sha2::{Digest, Sha256};
Expand Down
34 changes: 17 additions & 17 deletions backend/libraries/chat_events/src/message_content_internal.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ use crate::DeletedByInternal;
use candid::Principal;
use constants::{MEMO_PRIZE_FEE, MEMO_PRIZE_REFUND, OPENCHAT_TREASURY_CANISTER_ID, PRIZE_FEE_PERCENT};
use ledger_utils::{create_pending_transaction, format_crypto_amount};
use search::Document;
use search::simple::Document;
use serde::{Deserialize, Serialize};
use serde_bytes::ByteBuf;
use std::collections::{HashMap, HashSet};
Expand Down Expand Up @@ -279,28 +279,28 @@ impl From<&MessageContentInternal> for Document {

fn try_add_caption(document: &mut Document, caption_option: Option<&String>) {
if let Some(caption) = caption_option {
document.add_field(caption.to_owned(), 1.0, false);
document.add_field(caption);
}
}

fn try_add_caption_and_mime_type(document: &mut Document, caption_option: Option<&String>, mime_type: &str) {
document.add_field(mime_type.to_owned(), 1.0, false);
document.add_field(mime_type);
try_add_caption(document, caption_option);
}

match message_content {
MessageContentInternal::Text(c) => {
document.add_field(c.text.clone(), 1.0, false);
document.add_field(&c.text);
}
MessageContentInternal::Crypto(c) => {
let token = c.transfer.token();
document.add_field(token.token_symbol().to_string(), 1.0, false);
document.add_field(token.token_symbol());

let amount = c.transfer.units();
// This is only used for string searching so it's better to default to 8 than to trap
let decimals = c.transfer.token().decimals().unwrap_or(8);
let amount_string = format_crypto_amount(amount, decimals);
document.add_field(amount_string, 1.0, false);
document.add_field(&amount_string);

try_add_caption(&mut document, c.caption.as_ref())
}
Expand All @@ -310,32 +310,32 @@ impl From<&MessageContentInternal> for Document {
MessageContentInternal::File(c) => try_add_caption_and_mime_type(&mut document, c.caption.as_ref(), &c.mime_type),
MessageContentInternal::Giphy(c) => try_add_caption(&mut document, c.caption.as_ref()),
MessageContentInternal::Poll(p) => {
document.add_field("poll".to_string(), 1.0, false);
if let Some(text) = p.config.text.clone() {
document.add_field(text, 1.0, false);
document.add_field("poll");
if let Some(text) = &p.config.text {
document.add_field(text);
}
}
MessageContentInternal::GovernanceProposal(p) => {
document.add_field(p.proposal.title().to_string(), 1.0, false);
document.add_field(p.proposal.summary().to_string(), 1.0, false);
document.add_field(p.proposal.title());
document.add_field(p.proposal.summary());
}
MessageContentInternal::Prize(c) => {
document.add_field(c.transaction.token().token_symbol().to_string(), 1.0, false);
document.add_field(c.transaction.token().token_symbol());
try_add_caption(&mut document, c.caption.as_ref())
}
MessageContentInternal::PrizeWinner(c) => {
document.add_field(c.token_symbol.clone(), 1.0, false);
document.add_field(&c.token_symbol);
}
MessageContentInternal::MessageReminderCreated(r) => try_add_caption(&mut document, r.notes.as_ref()),
MessageContentInternal::MessageReminder(r) => try_add_caption(&mut document, r.notes.as_ref()),
MessageContentInternal::P2PSwap(p) => {
document.add_field("swap".to_string(), 1.0, false);
document.add_field(p.token0.token.token_symbol().to_string(), 1.0, false);
document.add_field(p.token1.token.token_symbol().to_string(), 1.0, false);
document.add_field("swap");
document.add_field(p.token0.token.token_symbol());
document.add_field(p.token1.token.token_symbol());
try_add_caption(&mut document, p.caption.as_ref())
}
MessageContentInternal::Custom(c) => {
document.add_field(c.kind.clone(), 1.0, false);
document.add_field(&c.kind);
}
MessageContentInternal::ReportedMessage(_)
| MessageContentInternal::Deleted(_)
Expand Down
48 changes: 46 additions & 2 deletions backend/libraries/chat_events/src/search_index.rs
Original file line number Diff line number Diff line change
@@ -1,10 +1,13 @@
use search::{Document, Query};
use serde::{Deserialize, Serialize};
use search::simple::{Document, Query};
use serde::de::{MapAccess, Visitor};
use serde::{Deserialize, Deserializer, Serialize};
use std::collections::{BTreeMap, HashSet};
use std::fmt::Formatter;
use types::{MessageIndex, UserId};

/// Search index over messages, mapping each `MessageIndex` to a
/// `(UserId, Document)` pair.
// NOTE(review): the `UserId` is presumably the message sender — confirm against the code that populates the map.
#[derive(Serialize, Deserialize, Default)]
pub struct SearchIndex {
// Deserialized through `deserialize_weighted_search_map`, which accepts entries whose
// document is in either the weighted or the simple format and stores them all as simple
// `Document`s. Serialization uses the default derive.
#[serde(deserialize_with = "deserialize_weighted_search_map")]
map: BTreeMap<MessageIndex, (UserId, Document)>,
}

Expand Down Expand Up @@ -32,3 +35,44 @@ impl SearchIndex {
.map(|(id, _)| *id)
}
}

struct SearchIndexVisitor;

impl<'de> Visitor<'de> for SearchIndexVisitor {
type Value = BTreeMap<MessageIndex, (UserId, Document)>;

fn expecting(&self, formatter: &mut Formatter) -> std::fmt::Result {
formatter.write_str("a map")
}

fn visit_map<A>(self, mut map: A) -> Result<Self::Value, A::Error>
where
A: MapAccess<'de>,
{
let mut result: BTreeMap<MessageIndex, (UserId, Document)> = BTreeMap::new();
while let Some((message_index, (user_id, doc))) = map.next_entry()? {
result.insert(message_index, (user_id, convert_to_simple_doc(doc)));
}
Ok(result)
}
}

/// Deserialization-only wrapper that accepts a document in either the
/// weighted or the simple representation.
///
/// Because the enum is `untagged`, serde tries the variants in declaration
/// order and keeps the first one that deserializes successfully, so
/// `Weighted` is attempted before `Simple`.
// NOTE(review): this relies on a serialized simple document not also being a valid
// weighted document — confirm against the two `Document` definitions in the `search` crate.
#[derive(Deserialize)]
#[serde(untagged)]
enum DocumentCombined {
/// Document in the `search::weighted` format.
Weighted(search::weighted::Document),
/// Document in the `search::simple` format.
Simple(Document),
}

/// Normalises a deserialized document to the simple representation.
///
/// A simple document is returned unchanged; a weighted document is converted
/// through its `Into<Document>` implementation.
fn convert_to_simple_doc(doc: DocumentCombined) -> Document {
    let weighted = match doc {
        // Already in the target format: nothing to convert.
        DocumentCombined::Simple(simple) => return simple,
        DocumentCombined::Weighted(weighted) => weighted,
    };
    weighted.into()
}

fn deserialize_weighted_search_map<'de, D: Deserializer<'de>>(
d: D,
) -> Result<BTreeMap<MessageIndex, (UserId, Document)>, D::Error> {
d.deserialize_map(SearchIndexVisitor)
}
4 changes: 2 additions & 2 deletions backend/libraries/group_chat_core/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ use group_community_common::{BotUpdate, GroupBots, MemberUpdate};
use itertools::Itertools;
use lazy_static::lazy_static;
use regex_lite::Regex;
use search::Query;
use search::simple::Query;
use serde::{Deserialize, Serialize};
use std::cmp::{max, min, Reverse};
use std::collections::{BTreeMap, BTreeSet, HashSet};
Expand Down Expand Up @@ -590,7 +590,7 @@ impl GroupChatCore {
Some(p) => p,
};

let query = Query::parse(search_term);
let query = Query::new(&search_term);

let matches = self
.events
Expand Down
18 changes: 9 additions & 9 deletions backend/libraries/principal_to_user_id_map/src/lib.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
use ic_principal::Principal;
use serde::de::SeqAccess;
use serde::de::MapAccess;
use serde::{Deserialize, Deserializer, Serialize};
use stable_memory_map::{with_map, with_map_mut, KeyPrefix, PrincipalToUserIdKeyPrefix};
use std::fmt::Formatter;
Expand Down Expand Up @@ -40,7 +40,7 @@ impl PrincipalToUserIdMap {
pub fn deserialize_principal_to_user_id_map_from_heap<'de, D: Deserializer<'de>>(
d: D,
) -> Result<PrincipalToUserIdMap, D::Error> {
d.deserialize_seq(Visitor)
d.deserialize_map(Visitor)
}

struct Visitor;
Expand All @@ -49,18 +49,18 @@ impl<'a> serde::de::Visitor<'a> for Visitor {
type Value = PrincipalToUserIdMap;

fn expecting(&self, formatter: &mut Formatter) -> std::fmt::Result {
formatter.write_str("a sequence of (Principal, UserId)")
formatter.write_str("a map of (Principal, UserId)")
}

fn visit_seq<A>(self, mut seq: A) -> Result<Self::Value, A::Error>
fn visit_map<A>(self, mut map: A) -> Result<Self::Value, A::Error>
where
A: SeqAccess<'a>,
A: MapAccess<'a>,
{
let mut map = PrincipalToUserIdMap::default();
while let Some((principal, user_id)) = seq.next_element()? {
map.insert(principal, user_id);
let mut result = PrincipalToUserIdMap::default();
while let Some((principal, user_id)) = map.next_entry()? {
result.insert(principal, user_id);
}
Ok(map)
Ok(result)
}
}

Expand Down
3 changes: 3 additions & 0 deletions backend/libraries/search/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,6 @@ edition = "2021"

[dependencies]
serde = { workspace = true, features = ["derive"] }

[dev-dependencies]
test-case = { workspace = true }
Loading

0 comments on commit 9a7b4d0

Please sign in to comment.