Skip to main content

aios_core/
text_analysis.rs

1//! 文本和文件路径分析 — 不保留原文内容。
2//!
3//! 从通知正文提取元数据(长度、文字系统、emoji 检测、语义标签),
4//! 从文件路径提取扩展名类别。所有函数均为纯函数,不持有状态。
5
6use aios_spec::{ExtensionCategory, ScriptHint, SemanticHint, TextHint};
7
8// ===== 文本分析 =====
9
10pub(crate) fn analyze_text(text: &str) -> TextHint {
11    let length_chars = text.chars().count();
12    let is_emoji_only = !text.is_empty() && text.chars().all(is_emoji);
13
14    let script = if text.is_empty() {
15        ScriptHint::Unknown
16    } else {
17        let mut has_latin = false;
18        let mut has_hanzi = false;
19        let mut has_cyrillic = false;
20        let mut has_arabic = false;
21
22        for ch in text.chars() {
23            match ch {
24                '\u{0041}'..='\u{007A}' | '\u{00C0}'..='\u{024F}' => has_latin = true,
25                '\u{4E00}'..='\u{9FFF}'
26                | '\u{3400}'..='\u{4DBF}'
27                | '\u{3000}'..='\u{303F}'
28                | '\u{FF00}'..='\u{FFEF}' => has_hanzi = true,
29                '\u{0400}'..='\u{04FF}' | '\u{0500}'..='\u{052F}' => has_cyrillic = true,
30                '\u{0600}'..='\u{06FF}'
31                | '\u{0750}'..='\u{077F}'
32                | '\u{FB50}'..='\u{FDFF}'
33                | '\u{FE70}'..='\u{FEFF}' => has_arabic = true,
34                _ => {},
35            }
36        }
37
38        let count = [has_latin, has_hanzi, has_cyrillic, has_arabic]
39            .iter()
40            .filter(|&&x| x)
41            .count();
42        match count {
43            0 => ScriptHint::Unknown,
44            1 if has_latin => ScriptHint::Latin,
45            1 if has_hanzi => ScriptHint::Hanzi,
46            1 if has_cyrillic => ScriptHint::Cyrillic,
47            1 if has_arabic => ScriptHint::Arabic,
48            _ => ScriptHint::Mixed,
49        }
50    };
51
52    TextHint {
53        length_chars,
54        script,
55        is_emoji_only,
56    }
57}
58
59/// 从通知标题和正文中提取语义标签。
60///
61/// 关键词匹配在本地完成,不上传原文。
62pub(crate) fn extract_semantic_hints(title: &str, text: &str) -> Vec<SemanticHint> {
63    let combined = format!("{} {}", title, text).to_lowercase();
64    let mut hints = Vec::new();
65
66    // 文件相关
67    if contains_any(
68        &combined,
69        &[
70            "文件",
71            "file",
72            "pdf",
73            "doc",
74            "docx",
75            "xls",
76            "xlsx",
77            "ppt",
78            "pptx",
79            "zip",
80            "rar",
81            "attachment",
82            "附件",
83        ],
84    ) {
85        hints.push(SemanticHint::FileMention);
86    }
87    // 图片相关
88    if contains_any(
89        &combined,
90        &[
91            "图片",
92            "照片",
93            "截图",
94            "image",
95            "photo",
96            "screenshot",
97            "jpg",
98            "jpeg",
99            "png",
100            "gif",
101            "webp",
102            "相册",
103        ],
104    ) {
105        hints.push(SemanticHint::ImageMention);
106    }
107    // 语音相关
108    if contains_any(
109        &combined,
110        &[
111            "语音", "voice", "audio", "mp3", "wav", "aac", "录音", "通话",
112        ],
113    ) {
114        hints.push(SemanticHint::AudioMessage);
115    }
116    // 链接相关
117    if contains_any(&combined, &["http", "https", "www.", "链接", "link", "url"]) {
118        hints.push(SemanticHint::LinkAttachment);
119    }
120    // 被提及 (@我)
121    if contains_any(
122        &combined,
123        &["@你", "@所有人", "提到了你", "mentioned you", "@"],
124    ) {
125        hints.push(SemanticHint::UserMentioned);
126    }
127    // 日历/会议
128    if contains_any(
129        &combined,
130        &[
131            "会议",
132            "meeting",
133            "calendar",
134            "日历",
135            "invitation",
136            "邀请",
137            "schedule",
138            "日程",
139        ],
140    ) {
141        hints.push(SemanticHint::CalendarInvitation);
142    }
143    // 金融/交易
144    if contains_any(
145        &combined,
146        &[
147            "支付",
148            "付款",
149            "转账",
150            "payment",
151            "transaction",
152            "红包",
153            "balance",
154            "余额",
155        ],
156    ) {
157        hints.push(SemanticHint::FinancialContext);
158    }
159    // 验证码
160    if contains_any(
161        &combined,
162        &["验证码", "code", "otp", "验证", "verification", "captcha"],
163    ) {
164        hints.push(SemanticHint::VerificationCode);
165    }
166
167    hints
168}
169
170// ===== 文件路径分析 =====
171
172/// 从文件路径中推断扩展名类别。
173pub(crate) fn classify_extension(path: &str) -> ExtensionCategory {
174    let lower = path.to_lowercase();
175    let ext = std::path::Path::new(&lower)
176        .extension()
177        .and_then(|e| e.to_str())
178        .unwrap_or("");
179
180    match ext {
181        "pdf" | "doc" | "docx" | "xls" | "xlsx" | "ppt" | "pptx" | "txt" | "md" | "csv" | "odt"
182        | "ods" | "odp" => ExtensionCategory::Document,
183        "jpg" | "jpeg" | "png" | "gif" | "webp" | "heic" | "heif" | "bmp" | "svg" | "tiff" => {
184            ExtensionCategory::Image
185        },
186        "mp4" | "mov" | "avi" | "mkv" | "webm" | "flv" | "wmv" | "3gp" => ExtensionCategory::Video,
187        "mp3" | "wav" | "aac" | "flac" | "ogg" | "wma" | "m4a" | "opus" => ExtensionCategory::Audio,
188        "zip" | "rar" | "7z" | "tar" | "gz" | "bz2" | "xz" | "apk" | "aab" => {
189            ExtensionCategory::Archive
190        },
191        "py" | "js" | "ts" | "rs" | "cpp" | "c" | "h" | "java" | "kt" | "swift" | "go" | "so"
192        | "dylib" | "dll" => ExtensionCategory::Code,
193        "" => ExtensionCategory::Unknown,
194        _ => ExtensionCategory::Other,
195    }
196}
197
198// ===== 通用工具 =====
199
/// Returns `true` when at least one of `keywords` occurs in `text` as a
/// substring; an empty keyword list yields `false`.
fn contains_any(text: &str, keywords: &[&str]) -> bool {
    for &keyword in keywords {
        if text.contains(keyword) {
            return true;
        }
    }
    false
}
203
/// Returns `true` when `ch` falls in one of the code-point ranges treated as
/// emoji here.
///
/// Besides pictograph blocks, the set includes the variation selectors and
/// the zero-width joiner so that multi-code-point emoji sequences pass a
/// per-char check. This is a range-based approximation: it does not consult
/// the Unicode emoji property tables.
fn is_emoji(ch: char) -> bool {
    matches!(ch,
        '\u{1F600}'..='\u{1F64F}'   // Emoticons
        | '\u{1F300}'..='\u{1F5FF}' // Misc Symbols and Pictographs
        | '\u{1F680}'..='\u{1F6FF}' // Transport and Map
        | '\u{1F900}'..='\u{1F9FF}' // Supplemental Symbols and Pictographs
        | '\u{1FA70}'..='\u{1FAFF}' // Symbols and Pictographs Extended-A
        | '\u{2600}'..='\u{26FF}'   // Misc symbols
        | '\u{2700}'..='\u{27BF}'   // Dingbats
        | '\u{2B50}'                // White medium star
        | '\u{2B55}'                // Heavy large circle
        | '\u{FE00}'..='\u{FE0F}'   // Variation Selectors
        | '\u{200D}'                // ZWJ
        | '\u{1F1E0}'..='\u{1F1FF}' // Flags
    )
}
216}