chore(deps): update chroma, regexp2 v2, replace dimiro1/reply (#37858)

- Update `github.com/alecthomas/chroma/v2` to `v2.25.0`.
- Migrate `github.com/dlclark/regexp2` to `/v2` (incorporates
https://github.com/go-gitea/gitea/pull/37664); drop the renovate pin.
- Replace the unmaintained `github.com/dimiro1/reply` (the last consumer
of `regexp2` v1 in our own code) with a small built-in reply parser for
incoming mail.

Signed-off-by: wxiaoguang <wxiaoguang@gmail.com>
Co-authored-by: Claude (Opus 4.7) <noreply@anthropic.com>
Co-authored-by: wxiaoguang <wxiaoguang@gmail.com>
Co-authored-by: Giteabot <teabot@gitea.io>
Co-authored-by: Nicolas <bircni@icloud.com>
This commit is contained in:
silverwind
2026-05-27 23:39:57 +02:00
committed by GitHub
parent 729c4b8813
commit f810e882a4
8 changed files with 201 additions and 22 deletions
+1 -2
View File
@@ -17,7 +17,6 @@ import (
"gitea.dev/modules/setting"
"gitea.dev/services/mailer/token"
"github.com/dimiro1/reply"
"github.com/emersion/go-imap"
"github.com/emersion/go-imap/client"
"github.com/jhillyerd/enmime/v2"
@@ -356,7 +355,7 @@ func getContentFromMailReader(env *enmime.Envelope) *MailContent {
}
return &MailContent{
Content: reply.FromText(env.Text),
Content: extractReply(env.Text),
Attachments: attachments,
}
}
+53
View File
@@ -150,3 +150,56 @@ func TestGetContentFromMailReader(t *testing.T) {
assert.Equal(t, "mail content without signature", content.Content)
assert.Empty(t, content.Attachments)
}
// TestExtractReply runs extractReply over a table of plain-text mail bodies and
// checks the returned reply text. Rows whose title ends in "kept" are bodies that
// must come back unchanged; the others expect the signature, attribution, quoted
// history or forwarded header block to be removed.
func TestExtractReply(t *testing.T) {
	type replyCase struct {
		title string
		mail  string
		want  string
	}
	table := []replyCase{
		{"plain text", "Email with only text.", "Email with only text."},
		{"crlf normalized", "line one\r\nline two\r\n", "line one\nline two"},
		{"trim blank lines", "\n\n\nactual reply\n\n\n", "actual reply"},
		{"signature delimiter", "the reply\n--\nJohn Doe\nAcme", "the reply"},
		{"rfc signature delimiter", "the reply\n-- \nJohn Doe", "the reply"},
		{"mobile signature", "My answer is yes.\n\nSent from my iPhone", "My answer is yes."},
		{"quote only kept", "> Email with only quote.", "> Email with only quote."},
		{"leading quote kept", "> This is a quote.\n\nAnd this is some text.", "> This is a quote.\n\nAnd this is some text."},
		{"trailing quote stripped", "My reply.\n\n> original line 1\n> original line 2", "My reply."},
		{"attribution and quote", "Looks good.\n\nOn Mon, Jan 1, 2024 John <j@x.com> wrote:\n> please review", "Looks good."},
		{"attribution without quote marks", "My reply.\n\nOn Wed, Sep 25, 2013, richard wrote:\noriginal text", "My reply."},
		{"original message separator", "Foo\n\n-------- Original Message --------\n\nTHE END.", "Foo"},
		{"outlook header block", "This is the actual reply.\n\nFrom: Some One <a@b.com>\nSent: Monday\nTo: Someone\nSubject: hi\n\nquoted body", "This is the actual reply."},
		{"french attribution", "C'est super !\n\nLe 4 janv. 2016 19:03, \"Neil\" <a@b.com> a écrit :\n> quoted", "C'est super !"},
		{"german attribution", "Hey :)\n\nAm 03.02.2016 3:35 schrieb Max <a@b.com>:\n> quoted", "Hey :)"},
		{"cyrillic wrote verb", "Yes.\n\n6 октября 2014 lidel написал:\n> quoted", "Yes."},
		{"localized signature", "My answer.\n\nEnvoyé depuis mon iPhone", "My answer."},
		{"swedish header block", "Hi everyone!\n\nFrån: Foo <a@b.com>\nSkickat: den 5 juni\nTill: x@y.com\nÄmne: hi\n\nbody", "Hi everyone!"},
		{"attribution only is empty", "On Mon, Jan 1, 2024 at 10:00 John <j@x.com> wrote:\n> please review", ""},
		{"prose ending in wrote kept", "Hi Bob,\nThanks for the report you wrote\nI'll fix it.", "Hi Bob,\nThanks for the report you wrote\nI'll fix it."},
		{"on with year and no time kept", "Hi,\nOn the 2024 roadmap we have three items.\nPlease review.", "Hi,\nOn the 2024 roadmap we have three items.\nPlease review."},
		{"date prose kept", "Notes:\n5 issues 2024 fixed at 9:15 today\nmore notes", "Notes:\n5 issues 2024 fixed at 9:15 today\nmore notes"},
		{"header needs from first", "Quick note:\nTo: which server?\nFrom: tests pass.\nThanks", "Quick note:\nTo: which server?\nFrom: tests pass.\nThanks"},
		{"indented header block", "Reply text.\n\n From: A <a@b.com>\n Sent: Monday\n To: x\n Subject: hi\n\nbody", "Reply text."},
		{"chinese signature", "回复内容\n\n發自我的iPhone", "回复内容"},
		{"japanese signature", "返信します\n\niPhoneから送信", "返信します"},
		{"chinese header block", "回复内容\n\n发件人:张三\n收件人:李四\n主题:你好\n\n原文", "回复内容"},
		{"japanese header block", "本文です\n\n差出人:山田\n宛先:田中\n件名:こんにちは\n\n原文", "本文です"},
		{"name-first attribution", "Okay.\n\nErlend <meta@x.com> schrieb am Di., 16. Aug. 2016\num 12:52 Uhr:\n> quoted", "Okay."},
		{"chinese attribution", "你好,谢谢回复。\n\n在 2024年1月1日,张三 <z@x.com> 写道:\n> 原始内容", "你好,谢谢回复。"},
		{"japanese attribution", "了解しました。\n\n田中さんは書きました:\n> 引用", "了解しました。"},
		{"korean attribution", "감사합니다.\n\n홍길동님이 작성:\n> 인용", "감사합니다."},
		{"email mention kept", "I asked Bob <bob@x.com> and he wrote back yes.\nSo we proceed.", "I asked Bob <bob@x.com> and he wrote back yes.\nSo we proceed."},
		{"trailing mailbox glyph", "My reply here.\n\nᐧ", "My reply here."},
		{"on with year and time prose kept", "On the 2024 roadmap we should meet at 10:00.\nI'll send invites.", "On the 2024 roadmap we should meet at 10:00.\nI'll send invites."},
		{"spanish year and time prose kept", "El informe del 2024 estará listo a las 10:00.\nGracias.", "El informe del 2024 estará listo a las 10:00.\nGracias."},
		{"chinese prose kept", "谢谢,已测试。\n发自我的内心的感谢", "谢谢,已测试。\n发自我的内心的感谢"},
		{"korean prose kept", "확인했습니다.\n이 문서는 회사에서 보냄", "확인했습니다.\n이 문서는 회사에서 보냄"},
		{"japanese prose kept", "了解しました。\n資料は会議から送信", "了解しました。\n資料は会議から送信"},
	}
	for _, tc := range table {
		t.Run(tc.title, func(t *testing.T) {
			assert.Equal(t, tc.want, extractReply(tc.mail))
		})
	}
}
+137
View File
@@ -0,0 +1,137 @@
// Copyright 2026 The Gitea Authors. All rights reserved.
// SPDX-License-Identifier: MIT
package incoming
import (
"regexp"
"strings"
"sync"
"gitea.dev/modules/util"
)
// Regular-expression fragments that are spliced into the reply-detection patterns.
const (
	// yearToken matches a 4-digit year delimited by word boundaries.
	yearToken = `\b\d{4}\b`

	// timeToken matches a clock time written as HH:MM or HH.MM.
	timeToken = `\b\d{1,2}[:.]\d{2}\b`

	// wroteVerbs lists the "wrote" verbs that end an attribution line.
	wroteVerbs = `wrote|writes|schrieb|skrev|napisał|escreveu|escribió|написал|пише|a écrit`

	// cjkWroteVerbs are the CJK equivalents; they are matched without a preceding
	// word-separator since those scripts don't space their words.
	cjkWroteVerbs = `写道|寫道|書きました|작성`

	// cjkDevice lists the device names that anchor CJK mobile signatures, so that
	// ordinary prose isn't mistaken for one.
	cjkDevice = `iphone|ipad|ipod|android|galaxy|手机|手機|平板`
)
// Field names of forwarded-mail header blocks, as written by the common mail
// clients/locales. All entries are lowercase and are matched as a prefix by
// headerLine, so supporting another locale only means adding its words here.

// headerFromFields holds the "From"-equivalents; one of them must begin a block.
var headerFromFields = []string{
	"from",
	"fra",
	"de",
	"von",
	"da",
	"van",
	"från",
	"expéditeur",
	"发件人",
	"寄件者",
	"差出人",
	"보낸사람",
}

// headerFields is the full set of fields allowed to follow the first line of a
// block: the non-"From" fields below, followed by every entry of headerFromFields.
var headerFields = append(
	[]string{
		"to", "cc", "bcc", "sent", "date", "subject", "reply-to",
		"til", "emne",
		"an", "betreff", "gesendet",
		"para", "assunto", "asunto",
		"risposta", "inviato", "oggetto",
		"destinataire", "objet", "répondre à",
		"aan", "onderwerp", "beantwoorden",
		"skickat", "till", "ämne",
		"收件人", "主题", "主旨", "主題", "收件者", "抄送", "日期",
		"宛先", "件名",
		"받는사람", "제목",
	},
	headerFromFields...,
)
// patterns are compiled on first use so the incoming-mail feature adds nothing to startup.
// The returned struct holds three line classifiers, each meant to be applied to a
// single whitespace-trimmed line: signature, attribution and separator.
var patterns = sync.OnceValue(func() (ret struct {
	signature, attribution, separator *regexp.Regexp
},
) {
	// colon that terminates a "wrote" attribution line: the ASCII ":" or the CJK
	// fullwidth colon (U+FF1A). It is written as an escape so the fullwidth form
	// cannot be lost by tooling that strips confusable characters; without it an
	// attribution line ending in a fullwidth colon is not recognized.
	const colon = `[:\x{FF1A}]`

	// "-- " delimiter and common mobile footers with frequent localizations. The CJK
	// forms require a device name so ordinary prose like "发自我的内心" or "会議から送信"
	// is not mistaken for a signature.
	ret.signature = regexp.MustCompile(`(?i)^(--|__|—` +
		`|sent (from|via|with) .+|get outlook for .+` +
		`|envoyé depuis mon .+|sendt fra min .+|von meinem .+|verzonden (met|vanaf) .+` +
		`|(發|发)自我的.*(` + cjkDevice + `).*` +
		`|.*(` + cjkDevice + `).*(から送信|에서 보냄|傳送|发送))$`)

	// attribution introducing quoted history: a line ending in a "wrote:" verb
	// (Latin/Cyrillic or CJK), a "Name <email> wrote" line, a lead word directly
	// followed by a day number or weekday plus a year and a time, or an ISO-date-led
	// line. The date phrasing, trailing colon and the email before the verb guard
	// against prose (so "On the 2024 roadmap … at 10:00" is not an attribution).
	ret.attribution = regexp.MustCompile(`(?i)^>*\s*(` +
		`.*[\s">'](` + wroteVerbs + `)\s*` + colon +
		`|.*(` + cjkWroteVerbs + `)\s*` + colon +
		`|.*<\S+@\S+>\s+(` + wroteVerbs + `)\b.*` +
		`|(on|at|le|am|el|em|den|il|op|dnia|w dniu)\b[\s,]*(\d|(?:mon|tue|wed|thu|fri|sat|sun)\b).*` + yearToken + `.*` + timeToken + `.*` +
		`|\d{4}-\d{2}-\d{2}\b.*` + timeToken + `.*` +
		`)$`)

	// a dash/underscore rule line, or text fenced by dashes such as
	// "-------- Original Message --------" or "-----Mensaje original-----"
	ret.separator = regexp.MustCompile(`(?i)^\s*\*?\s*([-_]{5,}|-{2,}.+-{2,}|original message|forwarded message)\s*\*?\s*$`)
	return ret
})
// extractReply returns the part of a plain-text email body that the user wrote.
// Everything from the first signature, reply attribution, separator or forwarded
// header block onwards is discarded, and so is a trailing run of quoted or blank
// lines. It is a slim, dependency-free reimplementation based on
// github.com/dimiro1/reply (MIT), covering the common mail-client formats and
// languages; bottom posting and forwarded bodies are not handled.
func extractReply(text string) string {
	re := patterns()
	body := strings.Split(util.NormalizeStringEOL(text), "\n")

	// Truncate at the first line that starts quoted history, a signature or a
	// forwarded header block.
	for idx, raw := range body {
		line := strings.TrimSpace(raw)
		boundary := re.signature.MatchString(line) ||
			re.attribution.MatchString(line) ||
			re.separator.MatchString(line) ||
			headerBlock(line, body[idx+1:])
		if boundary {
			body = body[:idx]
			break
		}
	}

	// Walk back over the trailing quoted/blank lines to find how many lines to keep.
	keep := len(body)
	for ; keep > 0; keep-- {
		tail := strings.TrimSpace(body[keep-1])
		// "ᐧ" is the trailing marker some mobile clients (Mailbox) append
		if tail != "" && tail != "ᐧ" && !strings.HasPrefix(tail, ">") {
			break
		}
	}
	// If every remaining line is quoted or blank, the whole body is kept.
	if keep == 0 {
		keep = len(body)
	}
	return strings.TrimSpace(strings.Join(body[:keep], "\n"))
}
// headerBlock reports whether a forwarded-mail header block begins at first.
// first must already be trimmed and must be a "From"-type field; the first
// non-blank line of rest must then be a header field as well. Requiring both
// keeps a lone "Subject:"-style sentence from being treated as a boundary.
func headerBlock(first string, rest []string) bool {
	if headerLine(first, headerFromFields) {
		for i := range rest {
			candidate := strings.TrimSpace(rest[i])
			if candidate == "" {
				continue // skip blank lines between the first field and the next
			}
			// only the first non-blank line decides
			return headerLine(candidate, headerFields)
		}
	}
	return false
}
// headerLine reports whether the already-trimmed line is a "Field:" header for one
// of fields. The line is lowercased before comparison, so fields must be given in
// lowercase. An ASCII colon must be followed by a space so prose like "To:do this"
// is ignored; the CJK fullwidth colon ":" (U+FF1A) needs no space.
func headerLine(line string, fields []string) bool {
	// Written as an escape so the separator cannot be silently lost: an empty
	// string here would make strings.HasPrefix always true, turning any line
	// that merely starts with a field name into a header.
	const fullwidthColon = "\uff1a"

	lower := strings.ToLower(line)
	for _, field := range fields {
		rest, ok := strings.CutPrefix(lower, field)
		if !ok {
			continue
		}
		if strings.HasPrefix(rest, ": ") || strings.HasPrefix(rest, fullwidthColon) {
			return true
		}
	}
	return false
}