Skip to content

Commit

Permalink
vmime: prevent loss of a space during text::createFromString (#306)
Browse files Browse the repository at this point in the history
```
mailbox(text("Test München West", charsets::UTF_8), "[email protected]").generate();
```

produces

```
=?us-ascii?Q?Test_?= =?utf-8?Q?M=C3=BCnchen?= =?us-ascii?Q?West?= <[email protected]>
```

The first space between ``Test`` and ``München`` is encoded as an
underscore along with the first word: ``Test_``. The second space
between ``München`` and ``West`` is encoded with neither of the two
words and thus lost. Decoding the text results in ``Test
MünchenWest`` instead of ``Test München West``.

This is caused by how ``vmime::text::createFromString()`` handles
transitions between 7-bit and 8-bit words: If an 8-bit word follows a
7-bit word, a space is appended to the previous word. The opposite
case of a 7-bit word following an 8-bit word *misses* this behaviour.

When one fixes this problem, a follow-up issue appears:

``text::createFromString("a b\xFFc d")`` tokenizes the input into
``m_words={word("a "), word("b\xFFc ", utf8), word("d")}``. This
"right-side alignment" nature of the whitespace is a problem for
word::generate():

As per RFC 2047, spaces between adjacent encoded words are just
separators but not meant to be displayed. A space between an encoded
word and a regular ASCII text is not just a separator but also meant
to be displayed.

When word::generate() outputs the b-word, it would have to strip one
space, but only when there is a transition from encoded-word to
unencoded word. word::generate() does not know whether d will be
encoded or unencoded.

The idea now is that we could change the tokenization of
``text::createFromString`` such that whitespace is at the *start* of
words rather than at the end. With that, word::generate() need not
know anything about the next word, but rather only the *previous*
one.

Thus, in this patch,

1. The tokenization of ``text::createFromString`` is changed to
   left-align spaces and the function is fixed to account for
   the missing space on transition.
2. ``word::generate`` learns how to steal a space character.
3. Testcases are adjusted to account for the shifted
   position of the space.

Fixes: #283, #284

Co-authored-by: Vincent Richard <[email protected]>
  • Loading branch information
jengelh and vincent-richard authored May 21, 2024
1 parent c105165 commit d296c2d
Show file tree
Hide file tree
Showing 4 changed files with 48 additions and 66 deletions.
91 changes: 30 additions & 61 deletions src/vmime/text.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -285,84 +285,53 @@ void text::createFromString(const string& in, const charset& ch) {
asciiPercent = (in.length() == 0 ? 100 : (100 * asciiCount) / in.length());
}

// If there are "too much" non-ASCII chars, encode everything
// If there are "too much" non-ASCII chars, produce just one
// vmime::word. Because encoding happens word-wise, all of the input
// gets encoded.

if (alwaysEncode || asciiPercent < 60) { // less than 60% ASCII chars

appendWord(make_shared <word>(in, ch));
return;

// Else, only encode words which need it
} else {

bool is8bit = false; // is the current word 8-bit?
bool prevIs8bit = false; // is previous word 8-bit?
unsigned int count = 0; // total number of words

for (size_t end = in.size(), pos = 0, start = 0 ; ; ) {

if (pos == end || parserHelpers::isSpace(in[pos])) {

const string chunk(in.begin() + start, in.begin() + pos);

if (pos != end) {
++pos;
}

if (is8bit) {

if (count && prevIs8bit) {

// No need to create a new encoded word, just append
// the current word to the previous one.
shared_ptr <word> w = getWordAt(getWordCount() - 1);
w->getBuffer() += " " + chunk;

} else {

if (count) {
shared_ptr <word> w = getWordAt(getWordCount() - 1);
w->getBuffer() += ' ';
}

appendWord(make_shared <word>(chunk, ch));
}

prevIs8bit = true;
++count;
}
// Else, only encode words which need it

} else {
size_t end = in.size(), pos = 0;
bool is8bit = false; // is the current word 8-bit?
bool prevIs8bit = false; // is previous word 8-bit?
unsigned int count = 0; // total number of words

if (count && !prevIs8bit) {
do {
size_t start = pos;

shared_ptr <word> w = getWordAt(getWordCount() - 1);
w->getBuffer() += " " + chunk;
for (; parserHelpers::isSpace(in[pos]); )
++pos;

} else {
for (; pos < end && !parserHelpers::isSpace(in[pos]); ++pos)
is8bit |= !parserHelpers::isAscii(in[pos]);

appendWord(make_shared <word>(chunk, charset(charsets::US_ASCII)));
// All chunks will have whitespace (if any) at front, never at the end
const string chunk(in.begin() + start, in.begin() + pos);

prevIs8bit = false;
++count;
}
}
if (prevIs8bit == is8bit && count > 0) {

if (pos == end) {
break;
}
// same bitness as previous word; merge
auto w = getWordAt(getWordCount() - 1);
w->getBuffer() += chunk;

is8bit = false;
start = pos;
} else {

} else if (!parserHelpers::isAscii(in[pos])) {
appendWord(make_shared <word>(chunk, charset(is8bit ? ch : charsets::US_ASCII)));
++count;

is8bit = true;
++pos;
}

} else {
prevIs8bit = is8bit;
is8bit = false;

++pos;
}
}
}
} while (pos < end);
}


Expand Down
3 changes: 3 additions & 0 deletions src/vmime/word.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -755,6 +755,9 @@ void word::generate(
if (!startNewLine && !state->isFirstWord && !state->lastCharIsSpace) {

os << " "; // Separate from previous word
if (!state->prevWordIsEncoded && m_buffer[0] == ' ')
wordEnc.getNextChunk(1);

++curLineLength;

state->lastCharIsSpace = true;
Expand Down
10 changes: 10 additions & 0 deletions tests/parser/mailboxTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ VMIME_TEST_SUITE_BEGIN(mailboxTest)
VMIME_TEST(testSeparatorInComment)
VMIME_TEST(testMalformations)
VMIME_TEST(testExcessiveQuoting)
VMIME_TEST(testSpacing)
VMIME_TEST_LIST_END


Expand Down Expand Up @@ -184,4 +185,13 @@ VMIME_TEST_SUITE_BEGIN(mailboxTest)
VASSERT_EQ("generate", "=?utf-8?Q?Foo_B=40r?= <[email protected]>", a->generate());
}

void testSpacing() {

vmime::text t("Foo B\xc3\xa4renstark Baz", vmime::charsets::UTF_8);
vmime::mailbox m(t, "[email protected]");
VASSERT_EQ("1", "Foo =?utf-8?Q?B=C3=A4renstark?= Baz", t.generate());
VASSERT_EQ("2", "=?us-ascii?Q?Foo?= =?utf-8?Q?_B=C3=A4renstark?= =?us-ascii?Q?_Baz?= <[email protected]>", m.generate());

}

VMIME_TEST_SUITE_END
10 changes: 5 additions & 5 deletions tests/parser/textTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -189,11 +189,11 @@ VMIME_TEST_SUITE_BEGIN(textTest)
t2.createFromString(s2, c2);

VASSERT_EQ("2.1", 3, t2.getWordCount());
VASSERT_EQ("2.2", "some ASCII characters and special chars: ", t2.getWordAt(0)->getBuffer());
VASSERT_EQ("2.2", "some ASCII characters and special chars:", t2.getWordAt(0)->getBuffer());
VASSERT_EQ("2.3", vmime::charset(vmime::charsets::US_ASCII), t2.getWordAt(0)->getCharset());
VASSERT_EQ("2.4", "\xc3\xa4\xd0\xb0", t2.getWordAt(1)->getBuffer());
VASSERT_EQ("2.5", c2, t2.getWordAt(1)->getCharset());
VASSERT_EQ("2.6", "and then more ASCII chars.", t2.getWordAt(2)->getBuffer());
VASSERT_EQ("2.6", " and then more ASCII chars.", t2.getWordAt(2)->getBuffer());
VASSERT_EQ("2.7", vmime::charset(vmime::charsets::US_ASCII), t2.getWordAt(2)->getCharset());
}

Expand Down Expand Up @@ -512,9 +512,9 @@ VMIME_TEST_SUITE_BEGIN(textTest)
text.createFromString("Achim Br\xc3\xa4ndt", vmime::charsets::UTF_8);

VASSERT_EQ("1", 2, text.getWordCount());
VASSERT_EQ("2", "Achim ", text.getWordAt(0)->getBuffer());
VASSERT_EQ("2", "Achim", text.getWordAt(0)->getBuffer());
VASSERT_EQ("3", "us-ascii", text.getWordAt(0)->getCharset());
VASSERT_EQ("4", "Br\xc3\xa4ndt", text.getWordAt(1)->getBuffer());
VASSERT_EQ("4", " Br\xc3\xa4ndt", text.getWordAt(1)->getBuffer());
VASSERT_EQ("5", "utf-8", text.getWordAt(1)->getCharset());

// Generate
Expand All @@ -534,7 +534,7 @@ VMIME_TEST_SUITE_BEGIN(textTest)

// Space MUST be encoded inside a word
vmime::mailbox mbox(vmime::text("Achim Br\xc3\xa4ndt", vmime::charsets::UTF_8), "[email protected]");
VASSERT_EQ("generate1", "=?us-ascii?Q?Achim_?= =?utf-8?Q?Br=C3=A4ndt?= <[email protected]>", mbox.generate());
VASSERT_EQ("generate1", "=?us-ascii?Q?Achim?= =?utf-8?Q?_Br=C3=A4ndt?= <[email protected]>", mbox.generate());

vmime::text txt;
txt.appendWord(vmime::make_shared <vmime::word>("Achim ", "us-ascii"));
Expand Down

0 comments on commit d296c2d

Please sign in to comment.