OpenRTX/lib/miosix-kernel/miosix/util/unicode.h

/***************************************************************************
 *   Copyright (C) 2013 by Terraneo Federico                               *
 *                                                                         *
 *   This program is free software; you can redistribute it and/or modify  *
 *   it under the terms of the GNU General Public License as published by  *
 *   the Free Software Foundation; either version 2 of the License, or     *
 *   (at your option) any later version.                                   *
 *                                                                         *
 *   This program is distributed in the hope that it will be useful,       *
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
 *   GNU General Public License for more details.                          *
 *                                                                         *
 *   As a special exception, if other files instantiate templates or use   *
 *   macros or inline functions from this file, or you compile this file   *
 *   and link it with other works to produce a work based on this file,    *
 *   this file does not by itself cause the resulting work to be covered   *
 *   by the GNU General Public License. However the source code for this   *
 *   file must still be made available in accordance with the GNU General  *
 *   Public License. This exception does not invalidate any other reasons  *
 *   why a work based on this file might be covered by the GNU General     *
 *   Public License.                                                       *
 *                                                                         *
 *   You should have received a copy of the GNU General Public License     *
 *   along with this program; if not, see <http://www.gnu.org/licenses/>   *
 ***************************************************************************/

#include <stdint.h>
#include <utility>

#ifndef UNICODE_H
#define UNICODE_H

#if __cplusplus <= 199711L
//These are builtin types in C++11, add them if compiling in C++03 mode
typedef uint16_t char16_t;
typedef uint32_t char32_t;
#endif // !c++11

namespace miosix {

/**
 * Result codes for unicode related conversion stuff
 */
class Unicode
{
public:
    /**
     * Possible errors for unicode string conversion
     */
    enum error
    {
        OK,                 ///< The string conversion completed successfully
        INSUFFICIENT_SPACE, ///< The source string is too long to fit
        INVALID_STRING      ///< The source string is an illegal unicode string
    };

    /// Represents an invalid code point
    static const char32_t invalid=0xffffffff;

    /**
     * Peek an unicode code point out of an iterator into an utf8 string
     * \param it an iterator into an utf8 encoded string
     * \param end iterator one past the last character of the string
     * \return an unicode code point, or Unicode::invalid if the string
     * contains an invalid code point. Returns 0 if the end of string is found,
     * and it is not in the middle of a character
     */
    template<typename Iter>
    static char32_t nextUtf8(Iter& it, Iter end)
    {
        return nextUtf8(it,end,true);
    }

    /**
     * Peek an unicode code point out of an iterator into an utf8 string
     * \param it an iterator into an utf8 encoded string, the string is assumed
     * to be nul-terminated
     * \return an unicode code point, or Unicode::invalid if the string
     * contains an invalid code point. Returns 0 if the end of string is found,
     * and it is not in the middle of a character
     */
    template<typename Iter>
    static char32_t nextUtf8(Iter& it)
    {
        return nextUtf8(it,it,false);
    }

    /**
     * Put an unicode code point into a character array, converting it to utf8.
     * \param dst pointer to the buffer where the character is to be written
     * \param c an unicode code point (utf32 char)
     * \param dstSize number of bytes available in dst
     * \return an error code and the number of bytes of dst that were used up to
     * write src to dst
     */
    static std::pair<error,int> putUtf8(char *dst, char32_t c, int dstSize);

    /**
     * Convert an utf8 string in an utf16 one
     * \param dst an utf16 string in system-dependent endianness (i.e: little
     * endian in a little endian machine and big endian in a big endian one)
     * \param dstSize size in units of char16_t of dst, to prevent overflow
     * \param src a nul-terminated utf8 string
     * \return an error code and the length (in units of char16_t) of the
     * string written to dst
     */
    static std::pair<error,int> utf8toutf16(char16_t *dst, int dstSize,
        const char *src);

    /**
     * Convert an utf16 string in an utf8 one
     * \param dst an utf8 string
     * \param dstSize size in bytes of dst, to prevent overflow
     * \param src a nul-terminated utf16 string in system-dependent endianness
     * (i.e: little endian in a little endian machine and big endian in a big
     * endian one)
     * \return an error code and the length of the string written to dst
     */
    static std::pair<error,int> utf16toutf8(char *dst, int dstSize,
        const char16_t *src);

    /**
     * \param str an utf8 encoded string
     * \return a pair with a bool that is true if the string is valid, and the
     * string length in bytes, not code points
     */
    static std::pair<bool,int> validateUtf8(const char *str);

private:
    /**
     * Common implementation of nextUtf8
     * \param it an iterator into an utf8 encoded string
     * \param end iterator one past the last character of the string
     * \param checkEnd true if there is the need to check for end of string
     * considering end. If false, a nul in the char stream is the only end
     * condition.
     * \return an unicode code point, or Unicode::invalid if the string
     * contains an invalid code point. Returns 0 if the end of string is found,
     * and it is not in the middle of a character
     */
    template<typename Iter>
    static char32_t nextUtf8(Iter& it, Iter end, bool checkEnd);
};

template<typename Iter>
char32_t Unicode::nextUtf8(Iter& it, Iter end, bool checkEnd)
{
    //End of string at the beginning, return 0
    if(checkEnd && it==end) return 0;

    //Note: cast to unsigned char to prevent sign extension if *it > 0x7f
    char32_t c=static_cast<unsigned char>(*it++);

    //Common case first: ASCII
    if(c<0x80) return c;

    //If not ASCII, decode to utf32
    int additionalBytes;
    if((c & 0xe0)==0xc0)      { c &= 0x1f; additionalBytes=1; } //110xxxxx
    else if((c & 0xf0)==0xe0) { c &= 0x0f; additionalBytes=2; } //1110xxxx
    else if((c & 0xf8)==0xf0) { c &= 0x07; additionalBytes=3; } //11110xxx
    else return invalid;
    for(int i=0;i<additionalBytes;i++)
    {
        //End of string in the middle of a char, return invalid
        if(checkEnd && it==end) return invalid;
        char32_t next=static_cast<unsigned char>(*it++);
        //This includes the case next==0
        if((next & 0xc0)!=0x80) return invalid;
        c<<=6;
        c |= next & 0x3f;
    }
    //Detect overlong encodings as errors to prevent vulnerabilities
    switch(additionalBytes)
    {
        case 1:
            if(c<0x80) return invalid;
            break;
        case 2:
            if(c<0x800) return invalid;
            break;
        case 3:
            if(c<0x10000) return invalid;
            break;
    }

    //Reserved space for surrogate pairs in utf16 are invalid code points
    if(c>=0xd800 && c<= 0xdfff) return invalid;
    //Unicode is limited in the range 0-0x10ffff
    if(c>0x10ffff) return invalid;
    return c;
}

} //namespace miosix

#endif //UNICODE_H