/src/kcodecs/src/kcodecs.cpp

Source
/*
    SPDX-FileCopyrightText: 2000-2001 Dawit Alemayehu <adawit@kde.org>
    SPDX-FileCopyrightText: 2001 Rik Hemsley (rikkus) <rik@kde.org>
    SPDX-FileCopyrightText: 2001-2002 Marc Mutz <mutz@kde.org>

    SPDX-License-Identifier: LGPL-2.0-only

    The encoding and decoding utilities in KCodecs with the exception of
    quoted-printable are based on the java implementation in HTTPClient
    package by Ronald Tschalär Copyright (C) 1996-1999.                          // krazy:exclude=copyright

    The quoted-printable codec as described in RFC 2045, section 6.7. is by
    Rik Hemsley (C) 2001.
*/

#include "kcodecs.h"
#include "kcharsets.h"
#include "kcodecs_debug.h"
#include "kcodecs_p.h"
#include "kcodecsbase64.h"
#include "kcodecsqp.h"
#include "kcodecsuuencode.h"

#include <array>
#include <cassert>
#include <cstring>
#include <stdlib.h>
#include <string.h>

#include <QDebug>
#include <QStringDecoder>
#include <QStringEncoder>

namespace KCodecs
{
static QList<QByteArray> charsetCache;

// Returns the cached spelling of the charset called name.
// The lookup is case-insensitive; a name that is not cached yet is stored
// (and returned) in upper case.
QByteArray cachedCharset(const QByteArray &name)
{
    // Iterate through a const alias so the shared list is never detached.
    const QList<QByteArray> &known = charsetCache;
    for (const QByteArray &entry : known) {
        if (name.compare(entry, Qt::CaseInsensitive) == 0) {
            return entry;
        }
    }

    const QByteArray canonical = name.toUpper();
    charsetCache.append(canonical);
    return canonical;
}

// View-based overload: returns the cached spelling of the charset called name.
// The lookup is case-insensitive; a name that is not cached yet is stored
// (and returned) in upper case.
QByteArray cachedCharset(QByteArrayView name)
{
    // Iterate through a const alias so the shared list is never detached.
    const QList<QByteArray> &known = charsetCache;
    for (const QByteArray &entry : known) {
        if (name.compare(entry, Qt::CaseInsensitive) == 0) {
            return entry;
        }
    }

    const QByteArray canonical = name.toByteArray().toUpper();
    charsetCache.append(canonical);
    return canonical;
}

namespace CodecNames
{
// Returns the charset name "UTF-8" as a QByteArray built with QByteArrayLiteral.
// NOTE(review): encodeRFC2047String() in this file assigns the returned temporary
// to a QByteArrayView; that presumably relies on QByteArrayLiteral not owning
// heap data - confirm before changing how this value is constructed.
QByteArray utf8()
{
    return QByteArrayLiteral("UTF-8");
}
}

// Merges the charset of the next decoded word into the charset tracked so far.
//  - nextCharset empty            -> currentCharset is kept
//  - currentCharset empty         -> nextCharset becomes the tracked charset
//  - both set and different       -> UTF-8, since only one charset per string is
//                                    supported and UTF-8 should cover any chars
//  - both set and equal           -> currentCharset is kept
Q_REQUIRED_RESULT
QByteArray updateEncodingCharset(const QByteArray &currentCharset, const QByteArray &nextCharset)
{
    if (nextCharset.isEmpty()) {
        return currentCharset;
    }
    if (currentCharset.isEmpty()) {
        return nextCharset;
    }
    if (currentCharset == nextCharset) {
        return currentCharset;
    }
    return CodecNames::utf8();
}

} // namespace KCodecs

/******************************** KCodecs ********************************/

// Encodes in as quoted-printable; useCRLF selects CRLF instead of LF line endings.
QByteArray KCodecs::quotedPrintableEncode(QByteArrayView in, bool useCRLF)
{
    // Translate the boolean flag into the codec's newline enum.
    Codec::NewlineType lineEnding = Codec::NewlineLF;
    if (useCRLF) {
        lineEnding = Codec::NewlineCRLF;
    }

    Codec *const qpCodec = Codec::codecForName("quoted-printable");
    return qpCodec->encode(in, lineEnding);
}

// Out-parameter variant: stores the quoted-printable encoding of in into out.
// useCRLF selects CRLF instead of LF line endings.
void KCodecs::quotedPrintableEncode(QByteArrayView in, QByteArray &out, bool useCRLF)
{
    // The value-returning overload takes the bool flag itself, so forward it
    // unchanged instead of converting it to a Codec::NewlineType value that
    // would only be converted back to bool implicitly.
    out = quotedPrintableEncode(in, useCRLF);
}

// Decodes quoted-printable data using the codec registered as "quoted-printable".
QByteArray KCodecs::quotedPrintableDecode(QByteArrayView in)
{
    Codec *const qpCodec = Codec::codecForName("quoted-printable");
    const QByteArray decoded = qpCodec->decode(in);
    return decoded;
}

// Out-parameter variant: stores the quoted-printable decoding of in into out.
void KCodecs::quotedPrintableDecode(QByteArrayView in, QByteArray &out)
{
    Codec *const qpCodec = Codec::codecForName("quoted-printable");
    out = qpCodec->decode(in);
}

// Encodes in using the codec registered as "base64".
QByteArray KCodecs::base64Encode(QByteArrayView in)
{
    Codec *const b64Codec = Codec::codecForName("base64");
    const QByteArray encoded = b64Codec->encode(in);
    return encoded;
}

// Out-parameter variant: stores the base64 encoding of in into out.
// insertLFs is accepted for interface compatibility but has no effect here.
void KCodecs::base64Encode(QByteArrayView in, QByteArray &out, bool insertLFs)
{
    Q_UNUSED(insertLFs);
    Codec *const b64Codec = Codec::codecForName("base64");
    out = b64Codec->encode(in);
}

// Decodes in using the codec registered as "base64".
QByteArray KCodecs::base64Decode(QByteArrayView in)
{
    Codec *const b64Codec = Codec::codecForName("base64");
    const QByteArray decoded = b64Codec->decode(in);
    return decoded;
}

void KCodecs::base64Decode(const QByteArrayView in, QByteArray &out)
{
    out = base64Decode(in);
}

// Decodes in using the codec registered as "x-uuencode".
QByteArray KCodecs::uudecode(QByteArrayView in)
{
    Codec *const uuCodec = Codec::codecForName("x-uuencode");
    const QByteArray decoded = uuCodec->decode(in);
    return decoded;
}

// Out-parameter variant: stores the uudecoded form of in into out.
void KCodecs::uudecode(QByteArrayView in, QByteArray &out)
{
    Codec *const uuCodec = Codec::codecForName("x-uuencode");
    out = uuCodec->decode(in);
}

//@cond PRIVATE

namespace KCodecs
{
// parse the encoded-word (scursor points to after the initial '=')
// retry point is the next position at which a failed parsing of an encoded word
// should be retried from. This is an optimization to avoid quadratic behavior on
// (intentionally) corrupted input
bool parseEncodedWord(const char *&scursor,
                      const char *const send,
                      QString *result,
                      QByteArray *usedCS,
                      const QByteArray &defaultCS,
                      CharsetOption charsetOption,
                      const char *&retryPoint)
{
    assert(result);

    // make sure the caller already did a bit of the work.
    assert(*(scursor - 1) == '=');

    //
    // STEP 1:
    // scan for the charset/language portion of the encoded-word
    //

    retryPoint = nullptr;
    char ch = *scursor++;

    if (ch != '?') {
        // qCDebug(KCODECS_LOG) << "first";
        // qCDebug(KCODECS_LOG) << "Premature end of encoded word";
        return false;
    }

    // remember start of charset (i.e. just after the initial "=?") and
    // language (just after the first '*') fields:
    const char *charsetStart = scursor;
    const char *languageStart = nullptr;

    // find delimiting '?' (and the '*' separating charset and language
    // tags, if any):
    for (; scursor != send; scursor++) {
        if (*scursor == '?') {
            break;
        } else if (*scursor == '*' && languageStart == nullptr) {
            languageStart = scursor + 1;
        }
    }

    // not found? can't be an encoded-word!
    if (scursor == send || *scursor != '?') {
        // qCDebug(KCODECS_LOG) << "second";
        // qCDebug(KCODECS_LOG) << "Premature end of encoded word";
        return false;
    }

    // extract charset information (keep in mind: the size given to the
    // ctor is one off due to the \0 terminator):
    QByteArrayView maybeCharset(charsetStart, std::min<qsizetype>((languageStart ? languageStart - 1 : scursor) - charsetStart, std::strlen(charsetStart)));

    //
    // STEP 2:
    // scan for the encoding portion of the encoded-word
    //

    // remember start of encoding (just _after_ the second '?'):
    scursor++;
    const char *encodingStart = scursor;

    // find next '?' (ending the encoding tag):
    for (; scursor != send; scursor++) {
        if (*scursor == '?') {
            break;
        }
    }

    // not found? Can't be an encoded-word!
    if (scursor == send || *scursor != '?') {
        // qCDebug(KCODECS_LOG) << "third";
        // qCDebug(KCODECS_LOG) << "Premature end of encoded word";
        return false;
    }

    // extract the encoding information:
    QByteArrayView maybeEncoding(encodingStart, scursor - encodingStart);

    // qCDebug(KCODECS_LOG) << "parseEncodedWord: found charset == \"" << maybeCharset
    //         << "\"; language == \"" << maybeLanguage
    //         << "\"; encoding == \"" << maybeEncoding << "\"";

    //
    // STEP 3:
    // scan for encoded-text portion of encoded-word
    //

    // remember start of encoded-text (just after the third '?'):
    scursor++;
    const char *encodedTextStart = scursor;

    // find the '?=' sequence (ending the encoded-text):
    for (; scursor != send; scursor++) {
        if (*scursor == '?') {
            if (scursor + 1 != send) {
                if (*(scursor + 1) != '=') { // We expect a '=' after the '?', but we got something else; ignore
                    // qCDebug(KCODECS_LOG) << "Stray '?' in q-encoded word, ignoring this.";
                    if (*(scursor - 1) == '=') {
                        retryPoint = scursor - 1;
                    }
                    continue;
                } else { // yep, found a '?=' sequence
                    scursor += 2;
                    break;
                }
            } else { // The '?' is the last char, but we need a '=' after it!
                // qCDebug(KCODECS_LOG) << "Premature end of encoded word";
                return false;
            }
        }
    }

    if (*(scursor - 2) != '?' || *(scursor - 1) != '=' || scursor < encodedTextStart + 2) {
        // qCDebug(KCODECS_LOG) << "Premature end of encoded word";
        if (!retryPoint) {
            retryPoint = scursor;
        }
        return false;
    }

    // set end sentinel for encoded-text:
    const char *const encodedTextEnd = scursor - 2;

    //
    // STEP 4:
    // setup decoders for the transfer encoding and the charset
    //

    // try if there's a codec for the encoding found:
    Codec *codec = Codec::codecForName(maybeEncoding);
    if (!codec) {
        // qCDebug(KCODECS_LOG) << "Unknown encoding" << maybeEncoding;
        return false;
    }

    // get an instance of a corresponding decoder:
    Decoder *dec = codec->makeDecoder();
    assert(dec);

    // try if there's a (text)codec for the charset found:
    QByteArray cs;
    QStringDecoder textCodec;
    if (charsetOption == KCodecs::ForceDefaultCharset || maybeCharset.isEmpty()) {
        textCodec = QStringDecoder(defaultCS);
        cs = cachedCharset(defaultCS);
    } else {
        textCodec = QStringDecoder(maybeCharset);
        if (!textCodec.isValid()) { // no suitable codec found => use default charset
            textCodec = QStringDecoder(defaultCS);
            cs = cachedCharset(defaultCS);
        } else {
            cs = cachedCharset(maybeCharset);
        }
    }
    if (usedCS) {
        *usedCS = updateEncodingCharset(*usedCS, cs);
    }

    if (!textCodec.isValid()) {
        // qCDebug(KCODECS_LOG) << "Unknown charset" << maybeCharset;
        delete dec;
        return false;
    };

    // qCDebug(KCODECS_LOG) << "mimeName(): \"" << textCodec->name() << "\"";

    // allocate a temporary buffer to store the 8bit text:
    int encodedTextLength = encodedTextEnd - encodedTextStart;
    QByteArray buffer;
    buffer.resize(codec->maxDecodedSizeFor(encodedTextLength));
    char *bbegin = buffer.data();
    char *bend = bbegin + buffer.length();

    //
    // STEP 5:
    // do the actual decoding
    //

    if (!dec->decode(encodedTextStart, encodedTextEnd, bbegin, bend)) {
        qWarning() << codec->name() << "codec lies about its maxDecodedSizeFor(" << encodedTextLength << ")\nresult may be truncated";
    }

    *result = textCodec.decode(QByteArrayView(buffer.data(), bbegin - buffer.data()));

    // qCDebug(KCODECS_LOG) << "result now: \"" << result << "\"";
    // cleanup:
    delete dec;

    return true;
}

} // namespace KCodecs

//@endcond

// Convenience overload: decodes msg with UTF-8 as the default charset and
// NoOption; the charset that ends up being used is discarded.
QString KCodecs::decodeRFC2047String(QStringView msg)
{
    const QByteArray utf8Message = msg.toUtf8();
    QByteArray discardedCharset;
    return decodeRFC2047String(utf8Message, &discardedCharset, CodecNames::utf8(), NoOption);
}

// Decodes all encoded-words found in src and returns the resulting text.
//
// src            raw input, possibly containing "=?...?=" encoded-words.
// usedCS         if non-null, is cleared and then receives the charset used
//                for decoding (merged via updateEncodingCharset()).
// defaultCS      charset handed to parseEncodedWord() as the fallback.
// charsetOption  forwarded to parseEncodedWord().
//
// Whitespace that separates two adjacent encoded-words is dropped; whitespace
// followed by anything else is kept. Text that fails to parse as an
// encoded-word is copied through unchanged.
QString KCodecs::decodeRFC2047String(QByteArrayView src, QByteArray *usedCS, const QByteArray &defaultCS, CharsetOption charsetOption)
{
    QByteArray result;
    QByteArray spaceBuffer;
    const char *scursor = src.constData();
    const char *send = scursor + src.length();
    bool onlySpacesSinceLastWord = false;
    if (usedCS) {
        usedCS->clear();
    }

    const char *retryPoint = nullptr;
    while (scursor != send) {
        // space following an encoded word: hold it back until we know what comes next.
        // The cast is required because passing a negative char to isspace() is
        // undefined behaviour, and the input may contain bytes >= 0x80.
        if (onlySpacesSinceLastWord && isspace(static_cast<unsigned char>(*scursor))) {
            spaceBuffer += *scursor++;
            continue;
        }

        // possible start of an encoded word (skipped while we are still before
        // the retry point reported by the last failed parse)
        if (*scursor == '=' && (!retryPoint || retryPoint <= scursor)) {
            QString decoded;
            ++scursor;
            const char *start = scursor;
            if (parseEncodedWord(scursor, send, &decoded, usedCS, defaultCS, charsetOption, retryPoint)) {
                result += decoded.toUtf8();
                onlySpacesSinceLastWord = true;
                spaceBuffer.clear();
            } else {
                if (onlySpacesSinceLastWord) {
                    result += spaceBuffer;
                    onlySpacesSinceLastWord = false;
                }
                result += '=';
                scursor = start; // reset cursor after parsing failure
            }
            continue;
        }

        // unencoded data: flush any held-back whitespace first
        if (onlySpacesSinceLastWord) {
            result += spaceBuffer;
            onlySpacesSinceLastWord = false;
        }
        result += *scursor;
        ++scursor;
    }

    // If there are any chars that couldn't be decoded in UTF-8,
    // fallback to local codec
    const QString tryUtf8 = QString::fromUtf8(result);
    if (!tryUtf8.contains(QChar(0xFFFD))) {
        return tryUtf8;
    }

    QStringDecoder codec(QStringDecoder::System);
    if (usedCS) {
        *usedCS = updateEncodingCharset(*usedCS, cachedCharset(QByteArrayView(codec.name())));
    }
    return codec.decode(result);
}

// Convenience overload: encodes src with the given charset and NoOption.
QByteArray KCodecs::encodeRFC2047String(QStringView src, const QByteArray &charset)
{
    const RFC2047EncodingOption defaultOption = RFC2047EncodingOption::NoOption;
    return KCodecs::encodeRFC2047String(src, charset, defaultOption);
}

// Characters that encodeRFC2047String() additionally treats as "needs encoding"
// when called with RFC2047EncodingOption::EncodeReservedCharcters.
// NOTE(review): the lookup uses std::strchr(), which also matches the
// terminating '\0', so a NUL byte counts as reserved as well - confirm intended.
static constexpr const char reservedCharacters[] = "\"()<>@,.;:\\[]=";

// Encodes src as a header value containing at most one encoded-word.
//
// The text is first converted to 8 bit using charset (falling back to the
// system codec if charset is unknown, and to UTF-8 if the conversion reports
// an error). If no byte needs encoding the 8-bit text is returned as is.
// Otherwise everything from the start of the first word that needs encoding up
// to the end of the last such word is wrapped into a single encoded-word,
// using "Q" encoding when the charset name contains "8859-" and "B" encoding
// otherwise. Text before and after that range is copied unchanged.
QByteArray KCodecs::encodeRFC2047String(QStringView src, QByteArrayView charset, KCodecs::RFC2047EncodingOption option)
{
    QStringEncoder codec(charset.constData());

    // Determine the charset name to advertise in the encoded-word.
    const bool charsetUsable = codec.isValid();
    if (!charsetUsable) {
        // no codec available => try local8Bit and hope the best ;-)
        codec = QStringEncoder(QStringEncoder::System);
    }
    QByteArrayView usedCS;
    if (charsetUsable && !charset.isEmpty()) {
        usedCS = charset;
    } else {
        usedCS = codec.name();
    }

    const QByteArray encoded8Bit = [&] { // encoded8Bit must be const to not detach below!
        QByteArray bytes = codec.encode(src);
        if (codec.hasError()) {
            // the chosen charset cannot represent src: switch to UTF-8
            usedCS = CodecNames::utf8();
            codec = QStringEncoder(QStringEncoder::Utf8);
            bytes = codec.encode(src);
        }
        return bytes;
    }();

    // "Q" encoding for iso-8859-x charsets, "B" encoding for everything else
    const bool useQEncoding = usedCS.contains("8859-");

    const qsizetype length = encoded8Bit.length();

    // true for bytes that force encoding: non us-ascii, the escape character
    // (for japanese encodings) and, if requested, the reserved characters.
    const auto mustEncode = [option](char c) {
        return (static_cast<signed char>(c) < 0) || (c == '\033')
            || (option == RFC2047EncodingOption::EncodeReservedCharcters && (std::strchr(reservedCharacters, c) != nullptr));
    };

    // position of the next space at or after pos (or length): we encode complete words
    const auto wordEnd = [&encoded8Bit, length](qsizetype pos) {
        while ((pos < length) && (encoded8Bit[pos] != ' ')) {
            ++pos;
        }
        return pos;
    };

    // Find the first byte that needs encoding; encoding starts at word boundaries.
    qsizetype start = 0;
    bool nonAscii = false;
    for (qsizetype i = 0; i < length; ++i) {
        const char c = encoded8Bit[i];
        if (c == ' ') {
            start = i + 1;
        }
        if (mustEncode(c)) {
            nonAscii = true;
            break;
        }
    }

    if (!nonAscii) {
        return encoded8Bit;
    }

    // Extend the encoded range up to the end of the last word that needs encoding.
    qsizetype end = wordEnd(start);
    for (qsizetype x = end; x < length; ++x) {
        if (mustEncode(encoded8Bit[x])) {
            end = wordEnd(x);
            x = end;
        }
    }

    QByteArray result;
    result = QByteArrayView(encoded8Bit).left(start) + "=?" + usedCS;

    if (useQEncoding) {
        result += "?Q?";

        // "Q"-encoding implementation described in RFC 2047
        static constexpr char hexDigits[] = "0123456789ABCDEF";
        for (qsizetype i = start; i < end; ++i) {
            const char c = encoded8Bit[i];
            if (c == ' ') { // make the result readable with not MIME-capable readers
                result += '_';
                continue;
            }

            // paranoid mode, encode *all* special chars to avoid problems
            // with "From" & "To" headers
            const bool isPlain = ((c >= 'a') && (c <= 'z')) || ((c >= 'A') && (c <= 'Z')) || ((c >= '0') && (c <= '9'));
            if (isPlain) {
                result += c;
                continue;
            }

            result += '=';
            result += hexDigits[(c & 0xF0) >> 4];
            result += hexDigits[c & 0x0F];
        }
    } else {
        result += "?B?" + encoded8Bit.mid(start, end - start).toBase64();
    }

    result += "?=";
    result += QByteArrayView(encoded8Bit).right(encoded8Bit.length() - end);

    return result;
}

/******************************************************************************/
/*                           KCodecs::Codec                                   */

// Looks up one of the built-in codecs by name (case-insensitive).
// Returns a pointer to a codec instance owned by a function-local static
// table, or nullptr (after logging a warning) if the name is unknown.
KCodecs::Codec *KCodecs::Codec::codecForName(QByteArrayView name)
{
    struct CodecEntry {
        const char *name;
        std::unique_ptr<KCodecs::Codec> codec;
    };
    static const std::array<CodecEntry, 6> registry{{
        {"b", std::make_unique<KCodecs::Rfc2047BEncodingCodec>()},
        {"base64", std::make_unique<KCodecs::Base64Codec>()},
        {"q", std::make_unique<KCodecs::Rfc2047QEncodingCodec>()},
        {"quoted-printable", std::make_unique<KCodecs::QuotedPrintableCodec>()},
        {"x-kmime-rfc2231", std::make_unique<KCodecs::Rfc2231EncodingCodec>()},
        {"x-uuencode", std::make_unique<KCodecs::UUCodec>()},
    }};

    const auto match = std::find_if(registry.cbegin(), registry.cend(), [name](const CodecEntry &candidate) {
        return name.compare(candidate.name, Qt::CaseInsensitive) == 0;
    });
    if (match == registry.cend()) {
        qWarning() << "Unknown codec" << name << "requested!";
        return nullptr;
    }
    return match->codec.get();
}

bool KCodecs::Codec::encode(const char *&scursor, const char *const send, char *&dcursor, const char *const dend, NewlineType newline) const
{
    // get an encoder:
    std::unique_ptr<Encoder> enc(makeEncoder(newline));
    if (!enc) {
        qWarning() << "makeEncoder failed for" << name();
        return false;
    }

    // encode and check for output buffer overflow:
    while (!enc->encode(scursor, send, dcursor, dend)) {
        if (dcursor == dend) {
            return false; // not enough space in output buffer
        }
    }

    // finish and check for output buffer overflow:
    while (!enc->finish(dcursor, dend)) {
        if (dcursor == dend) {
            return false; // not enough space in output buffer
        }
    }

    return true; // successfully encoded.
}

// Encodes src and returns the result as a new byte array.
// The output buffer is sized with maxEncodedSizeFor(); if that turns out to be
// too small a critical message is logged and the (truncated) data produced so
// far is returned.
QByteArray KCodecs::Codec::encode(QByteArrayView src, NewlineType newline) const
{
    // allocate buffer for the worst case:
    QByteArray result;
    result.resize(maxEncodedSizeFor(src.size(), newline));

    // set up iterators:
    QByteArray::ConstIterator iit = src.begin();
    QByteArray::ConstIterator iend = src.end();
    QByteArray::Iterator oit = result.begin();
    QByteArray::ConstIterator oend = result.end();

    // encode
    if (!encode(iit, iend, oit, oend, newline)) {
        // The size estimate comes from maxEncodedSizeFor(), so name that function.
        qCritical() << name() << "codec lies about it's maxEncodedSizeFor()";
    }

    // shrink result to actual size:
    result.truncate(oit - result.begin());

    return result;
}

// Decodes src and returns the result as a new byte array.
// The output buffer is sized with maxDecodedSizeFor(); if that turns out to be
// too small a critical message is logged and the data produced so far is returned.
QByteArray KCodecs::Codec::decode(QByteArrayView src, NewlineType newline) const
{
    // worst-case sized output buffer
    QByteArray decoded;
    decoded.resize(maxDecodedSizeFor(src.size(), newline));

    // input and output cursors with their end sentinels
    QByteArray::ConstIterator inCursor = src.begin();
    QByteArray::ConstIterator inEnd = src.end();
    QByteArray::Iterator outCursor = decoded.begin();
    QByteArray::ConstIterator outEnd = decoded.end();

    const bool complete = decode(inCursor, inEnd, outCursor, outEnd, newline);
    if (!complete) {
        qCritical() << name() << "codec lies about it's maxDecodedSizeFor()";
    }

    // keep only the bytes that were actually written
    decoded.truncate(outCursor - decoded.begin());
    return decoded;
}

// Decodes [scursor, send) into [dcursor, dend) with a freshly created decoder.
// Both cursors are advanced as data is consumed/produced. Returns false if the
// output buffer filled up before the decoder was done; true once decoding and
// finishing both completed.
bool KCodecs::Codec::decode(const char *&scursor, const char *const send, char *&dcursor, const char *const dend, NewlineType newline) const
{
    const std::unique_ptr<Decoder> decoder(makeDecoder(newline));
    assert(decoder);

    // feed the input until the decoder reports completion:
    for (;;) {
        if (decoder->decode(scursor, send, dcursor, dend)) {
            break;
        }
        if (dcursor == dend) {
            return false; // not enough space in output buffer
        }
    }

    // let the decoder emit whatever it still holds back:
    for (;;) {
        if (decoder->finish(dcursor, dend)) {
            break;
        }
        if (dcursor == dend) {
            return false; // not enough space in output buffer
        }
    }

    return true; // successfully decoded.
}

/******************************************************************************/
/*                          KCodecs::Encoder                                  */

// Initializes the encoder's private state: the internal output buffer starts
// empty (cursor at 0) and the requested newline type is stored.
KCodecs::EncoderPrivate::EncoderPrivate(Codec::NewlineType newline)
    : outputBufferCursor(0)
    , newline(newline)
{
}

// Creates an encoder whose private state is set up for the given newline type.
KCodecs::Encoder::Encoder(Codec::NewlineType newline)
    : d(new KCodecs::EncoderPrivate(newline))
{
}

// Defaulted destructor, defined in this translation unit.
// NOTE(review): presumably kept out-of-line so EncoderPrivate is a complete type
// where d is destroyed - confirm against the declaration of d.
KCodecs::Encoder::~Encoder() = default;

// Writes ch to the output stream if there is room and returns true.
// Otherwise ch is parked in the internal output buffer (or dropped with a
// critical message if that buffer is full as well) and false is returned.
bool KCodecs::Encoder::write(char ch, char *&dcursor, const char *const dend)
{
    if (dcursor == dend) {
        // output stream is full: buffer the char for a later flush
        if (d->outputBufferCursor < maxBufferedChars) {
            d->outputBuffer[d->outputBufferCursor++] = ch;
        } else {
            qCritical() << "KCodecs::Encoder: internal buffer overflow!";
        }
        return false;
    }

    // there's space in the output stream, write there:
    *dcursor++ = ch;
    return true;
}

// Moves as many buffered chars as fit from the internal output buffer to the
// output stream. Returns true if the buffer is empty afterwards, false if some
// chars had to stay behind; those are shifted to the front of the buffer.
bool KCodecs::Encoder::flushOutputBuffer(char *&dcursor, const char *const dend)
{
    // copy output buffer to output stream while there is room:
    int flushed = 0;
    while (dcursor != dend && flushed < d->outputBufferCursor) {
        *dcursor++ = d->outputBuffer[flushed];
        ++flushed;
    }

    // chars that did not fit are pushed to the beginning of the buffer:
    const int pending = d->outputBufferCursor - flushed;
    if (pending != 0) {
        ::memmove(d->outputBuffer, d->outputBuffer + flushed, pending);
    }
    d->outputBufferCursor = pending;

    return pending == 0;
}

// Writes a line ending: "\r\n" if the encoder was created with NewlineCRLF,
// a bare "\n" otherwise. The return value is that of writing the '\n'.
bool KCodecs::Encoder::writeCRLF(char *&dcursor, const char *const dend)
{
    const bool needsCarriageReturn = (d->newline == Codec::NewlineCRLF);
    if (needsCarriageReturn) {
        write('\r', dcursor, dend);
    }

    const bool lineFeedWritten = write('\n', dcursor, dend);
    return lineFeedWritten;
}

/******************************************************************************/
/*                           KCodecs::Decoder                                 */

// Initializes the decoder's private state by storing the requested newline type.
KCodecs::DecoderPrivate::DecoderPrivate(Codec::NewlineType newline)
    : newline(newline)
{
}

// Creates a decoder whose private state is set up for the given newline type.
KCodecs::Decoder::Decoder(Codec::NewlineType newline)
    : d(new KCodecs::DecoderPrivate(newline))
{
}

// Defaulted destructor, defined in this translation unit.
// NOTE(review): presumably kept out-of-line so DecoderPrivate is a complete type
// where d is destroyed - confirm against the declaration of d.
KCodecs::Decoder::~Decoder() = default;

Coverage Report

Created: 2026-05-27 07:07