/src/libcdr/src/lib/libcdr_utils.cpp

Source
/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/*
 * This file is part of the libcdr project.
 *
 * This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
 */

#include "libcdr_utils.h"

#include <cassert>
#include <climits>
#include <cstdarg>
#include <cstdio>
#include <string.h>

#include <unicode/ucsdet.h>
#include <unicode/ucnv.h>
#include <unicode/utypes.h>
#include <unicode/utf8.h>

// Number of elements in a statically sized array. The whole expansion and the
// argument are parenthesized so the macro can be embedded in a larger
// expression (e.g. as a divisor) without changing its meaning.
#define CDR_NUM_ELEMENTS(array) (sizeof(array)/sizeof((array)[0]))

// Combines a UTF-16 surrogate pair (h = high/lead unit, l = low/trail unit)
// into the supplementary-plane code point it encodes.
#define SURROGATE_VALUE(h,l) (((h) - 0xd800) * 0x400 + (l) - 0xdc00 + 0x10000)

namespace
{

// Translates a charset name (as compared below, case-sensitively) into the
// charset identifier used by appendCharacters(). Names that are not listed,
// as well as the ANSI ones, yield 0.
static unsigned short getEncodingFromICUName(const char *name)
{
  struct NameToCharset
  {
    const char *icuName;
    unsigned short charset;
  };

  static const NameToCharset knownNames[] =
  {
    // ANSI
    { "ISO-8859-1", 0 },
    { "windows-1252", 0 },
    // CENTRAL EUROPE
    { "ISO-8859-2", 0xee },
    { "windows-1250", 0xee },
    // RUSSIAN
    { "ISO-8859-5", 0xcc },
    { "windows-1251", 0xcc },
    { "KOI8-R", 0xcc },
    // ARABIC
    { "ISO-8859-6", 0xb2 },
    { "windows-1256", 0xb2 },
    // TURKISH
    { "ISO-8859-9", 0xa2 },
    { "windows-1254", 0xa2 },
    // GREEK
    { "ISO-8859-7", 0xa1 },
    { "windows-1253", 0xa1 },
    // HEBREW
    { "ISO-8859-8", 0xb1 },
    { "windows-1255", 0xb1 },
    // JAPANESE
    { "Shift_JIS", 0x80 },
    { "ISO-2022-JP", 0x80 },
    { "EUC-JP", 0x80 },
    { "windows-932", 0x80 },
    // KOREAN
    { "ISO-2022-KR", 0x81 },
    { "EUC-KR", 0x81 },
    { "windows-949", 0x81 },
    // CHINESE SIMPLIFIED
    { "ISO-2022-CN", 0x86 },
    { "GB18030", 0x86 },
    { "windows-936", 0x86 },
    // CHINESE TRADITIONAL
    { "Big5", 0x88 },
    { "windows-950", 0x88 }
  };

  for (const NameToCharset &entry : knownNames)
  {
    if (strcmp(name, entry.icuName) == 0)
      return entry.charset;
  }

  return 0;
}

static unsigned short getEncoding(const unsigned char *buffer, unsigned long bufferLength)
{
  if (!buffer)
    return 0;
  UErrorCode status = U_ZERO_ERROR;
  UCharsetDetector *csd = nullptr;
  try
  {
    csd = ucsdet_open(&status);
    if (U_FAILURE(status) || !csd)
      return 0;
    ucsdet_enableInputFilter(csd, true);
    ucsdet_setText(csd, (const char *)buffer, (unsigned)bufferLength, &status);
    if (U_FAILURE(status))
      throw libcdr::EncodingException();
    const UCharsetMatch *csm = ucsdet_detect(csd, &status);
    if (U_FAILURE(status) || !csm)
      throw libcdr::EncodingException();
    const char *name = ucsdet_getName(csm, &status);
    if (U_FAILURE(status) || !name)
      throw libcdr::EncodingException();
    int32_t confidence = ucsdet_getConfidence(csm, &status);
    if (U_FAILURE(status))
      throw libcdr::EncodingException();
    CDR_DEBUG_MSG(("UCSDET: getEncoding name %s, confidence %i\n", name, confidence));
    unsigned short encoding = getEncodingFromICUName(name);
    ucsdet_close(csd);
    /* From ICU documentation
     * A confidence value of ten does have a general meaning - it is used
     * for charsets that can represent the input data, but for which there
     * is no other indication that suggests that the charset is the correct
     * one. Pure 7 bit ASCII data, for example, is compatible with a great
     * many charsets, most of which will appear as possible matches with
     * a confidence of 10.
     */
    if (confidence == 10)
      return 0;
    return encoding;
  }
  catch (const libcdr::EncodingException &)
  {
    ucsdet_close(csd);
    return 0;
  }
}

// Encodes one code point as UTF-8 and appends it to text. A carriage return
// is replaced by a new line first.
static void _appendUCS4(librevenge::RVNGString &text, UChar32 ucs4Character)
{
  // Convert carriage returns to new line characters
  // Writerperfect/LibreOffice will replace them by <text:line-break>
  const UChar32 codePoint = (ucs4Character == (UChar32) 0x0d) ? (UChar32) '\n' : ucs4Character;

  // Room for the longest UTF-8 sequence plus the terminating NUL.
  unsigned char utf8[U8_MAX_LENGTH+1];
  int length = 0;
  U8_APPEND_UNSAFE(utf8, length, codePoint);
  utf8[length] = 0;

  text.append(reinterpret_cast<const char *>(utf8));
}

} // anonymous namespace

// Reads one byte from input and advances the stream.
// The endianness flag is accepted for symmetry with the wider readers and ignored.
// Throws EndOfStreamException if input is null, already at its end, or the
// read does not deliver exactly one byte.
uint8_t libcdr::readU8(librevenge::RVNGInputStream *input, bool /* bigEndian */)
{
  const bool exhausted = !input || input->isEnd();
  if (exhausted)
  {
    CDR_DEBUG_MSG(("Throwing EndOfStreamException\n"));
    throw EndOfStreamException();
  }

  unsigned long bytesRead = 0;
  const uint8_t *const data = input->read(sizeof(uint8_t), bytesRead);
  if (!data || bytesRead != sizeof(uint8_t))
  {
    CDR_DEBUG_MSG(("Throwing EndOfStreamException\n"));
    throw EndOfStreamException();
  }

  return *data;
}

// Reads a 16-bit unsigned integer from input, little-endian unless bigEndian is set.
// Throws EndOfStreamException if input is null, already at its end, or fewer
// than two bytes could be read.
uint16_t libcdr::readU16(librevenge::RVNGInputStream *input, bool bigEndian)
{
  const bool exhausted = !input || input->isEnd();
  if (exhausted)
  {
    CDR_DEBUG_MSG(("Throwing EndOfStreamException\n"));
    throw EndOfStreamException();
  }

  unsigned long bytesRead = 0;
  const uint8_t *const data = input->read(sizeof(uint16_t), bytesRead);
  if (!data || bytesRead != sizeof(uint16_t))
  {
    CDR_DEBUG_MSG(("Throwing EndOfStreamException\n"));
    throw EndOfStreamException();
  }

  // Index of the most / least significant byte within the two bytes read.
  const unsigned high = bigEndian ? 0 : 1;
  const unsigned low = 1 - high;
  return (uint16_t)(data[low]|((uint16_t)data[high]<<8));
}

// Reads 16 bits via readU16() and converts the result to a signed value.
// Propagates whatever readU16() throws.
int16_t libcdr::readS16(librevenge::RVNGInputStream *input, bool bigEndian)
{
  const uint16_t raw = readU16(input, bigEndian);
  return static_cast<int16_t>(raw);
}

// Reads a 32-bit unsigned integer from input, little-endian unless bigEndian is set.
// Throws EndOfStreamException if input is null, already at its end, or fewer
// than four bytes could be read.
uint32_t libcdr::readU32(librevenge::RVNGInputStream *input, bool bigEndian)
{
  const bool exhausted = !input || input->isEnd();
  if (exhausted)
  {
    CDR_DEBUG_MSG(("Throwing EndOfStreamException\n"));
    throw EndOfStreamException();
  }

  unsigned long bytesRead = 0;
  const uint8_t *const data = input->read(sizeof(uint32_t), bytesRead);
  if (!data || bytesRead != sizeof(uint32_t))
  {
    CDR_DEBUG_MSG(("Throwing EndOfStreamException\n"));
    throw EndOfStreamException();
  }

  // Assemble the value starting from its least significant byte.
  uint32_t value = 0;
  for (unsigned i = 0; i < sizeof(uint32_t); ++i)
  {
    const uint8_t byte = bigEndian ? data[sizeof(uint32_t) - 1 - i] : data[i];
    value |= (uint32_t)byte << (8 * i);
  }
  return value;
}

// Reads 32 bits via readU32() and converts the result to a signed value.
// Propagates whatever readU32() throws.
int32_t libcdr::readS32(librevenge::RVNGInputStream *input, bool bigEndian)
{
  const uint32_t raw = readU32(input, bigEndian);
  return static_cast<int32_t>(raw);
}

// Reads a 64-bit unsigned integer from input, little-endian unless bigEndian is set.
// Throws EndOfStreamException if input is null, already at its end, or fewer
// than eight bytes could be read.
uint64_t libcdr::readU64(librevenge::RVNGInputStream *input, bool bigEndian)
{
  const bool exhausted = !input || input->isEnd();
  if (exhausted)
  {
    CDR_DEBUG_MSG(("Throwing EndOfStreamException\n"));
    throw EndOfStreamException();
  }

  unsigned long bytesRead = 0;
  const uint8_t *const data = input->read(sizeof(uint64_t), bytesRead);
  if (!data || bytesRead != sizeof(uint64_t))
  {
    CDR_DEBUG_MSG(("Throwing EndOfStreamException\n"));
    throw EndOfStreamException();
  }

  // Shift the bytes in starting from the most significant one.
  uint64_t value = 0;
  for (unsigned i = 0; i < sizeof(uint64_t); ++i)
  {
    const uint8_t byte = bigEndian ? data[i] : data[sizeof(uint64_t) - 1 - i];
    value = (value << 8) | (uint64_t)byte;
  }
  return value;
}

// Reads 64 bits via readU64() and returns them reinterpreted as a double
// (same object representation, no numeric conversion).
// Propagates whatever readU64() throws.
double libcdr::readDouble(librevenge::RVNGInputStream *input, bool bigEndian)
{
  static_assert(sizeof(double) == sizeof(uint64_t), "double must be 64 bits wide");

  const uint64_t bits = readU64(input, bigEndian);

  // Copy the bytes instead of reading the inactive member of a union, which
  // is undefined behaviour in C++.
  double value = 0.0;
  memcpy(&value, &bits, sizeof(value));
  return value;
}

// Reads a 32-bit value via readU32() and interprets it as a fixed-point
// number: the upper 16 bits are the signed integer part, the lower 16 bits
// the fraction. Propagates whatever readU32() throws.
double libcdr::readFixedPoint(librevenge::RVNGInputStream *input, bool bigEndian)
{
  const unsigned raw = readU32(input, bigEndian);
  const short integerPart = (short)((raw >> 16) & 0xFFFF);
  // NOTE(review): the fraction is scaled by 0xFFFF, not 0x10000 - confirm
  // this is what the file format intends.
  const double fraction = (double)(raw & 0x0000FFFF) / (double)0xFFFF;
  return (double)integerPart + fraction;
}

// Returns the total length of input in bytes and leaves the stream at the
// position it had on entry.
//
// Throws EndOfStreamException if input is null, if a required seek fails, or
// if the stream reports a negative end position.
unsigned long libcdr::getLength(librevenge::RVNGInputStream *const input)
{
  if (!input)
    throw EndOfStreamException();

  const long orig = input->tell();
  long end = 0;

  if (input->seek(0, librevenge::RVNG_SEEK_END) == 0)
  {
    end = input->tell();
  }
  else
  {
    // RVNG_SEEK_END does not work. Use the harder way.
    if (input->seek(0, librevenge::RVNG_SEEK_SET) != 0)
      throw EndOfStreamException();
    while (!input->isEnd())
    {
      readU8(input);
      ++end;
    }
  }

  if (input->seek(orig, librevenge::RVNG_SEEK_SET) != 0)
    throw EndOfStreamException();

  // Checked at run time rather than with assert(): with NDEBUG a negative
  // position would otherwise be returned as a huge unsigned length.
  if (end < 0)
    throw EndOfStreamException();

  return static_cast<unsigned long>(end);
}

// Returns the number of bytes between the current position of input and its end.
//
// Throws EndOfStreamException under the same conditions as getLength(), and
// also when the stream reports a negative current position. Returns 0 if the
// position is at or beyond the reported length.
unsigned long libcdr::getRemainingLength(librevenge::RVNGInputStream *const input)
{
  // Evaluate getLength() in its own statement: it rejects a null input, and
  // the operands of a single subtraction are not evaluated in a fixed order.
  const unsigned long length = getLength(input);

  const long position = input->tell();
  if (position < 0)
    throw EndOfStreamException();

  // Avoid unsigned wrap-around should the position lie past the end.
  const unsigned long offset = static_cast<unsigned long>(position);
  return (offset < length) ? (length - offset) : 0;
}

// Rounds d to the nearest integer, halves away from zero.
//
// Values that do not fit into an int are clamped to INT_MAX / INT_MIN and NaN
// yields 0; converting such values directly would be undefined behaviour.
int libcdr::cdr_round(double d)
{
  if (d != d) // NaN is the only value that differs from itself
    return 0;

  const double shifted = (d>0) ? d+0.5 : d-0.5;
  if (shifted >= (double)INT_MAX)
    return INT_MAX;
  if (shifted <= (double)INT_MIN)
    return INT_MIN;
  return int(shifted);
}

// Appends the low 16 bits of value to buffer, least significant byte first.
void libcdr::writeU16(librevenge::RVNGBinaryData &buffer, const int value)
{
  const unsigned char lowByte = (unsigned char)(value & 0xFF);
  const unsigned char highByte = (unsigned char)((value >> 8) & 0xFF);
  buffer.append(lowByte);
  buffer.append(highByte);
}

// Appends the low 32 bits of value to buffer, least significant byte first.
void libcdr::writeU32(librevenge::RVNGBinaryData &buffer, const int value)
{
  for (int shift = 0; shift < 32; shift += 8)
    buffer.append((unsigned char)((value >> shift) & 0xFF));
}

void libcdr::appendCharacters(librevenge::RVNGString &text, std::vector<unsigned char> characters, unsigned short charset)
{
  if (characters.empty())
    return;
  static const UChar32 symbolmap [] =
  {
    0x0020, 0x0021, 0x2200, 0x0023, 0x2203, 0x0025, 0x0026, 0x220D, // 0x20 ..
    0x0028, 0x0029, 0x2217, 0x002B, 0x002C, 0x2212, 0x002E, 0x002F,
    0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037,
    0x0038, 0x0039, 0x003A, 0x003B, 0x003C, 0x003D, 0x003E, 0x003F,
    0x2245, 0x0391, 0x0392, 0x03A7, 0x0394, 0x0395, 0x03A6, 0x0393,
    0x0397, 0x0399, 0x03D1, 0x039A, 0x039B, 0x039C, 0x039D, 0x039F,
    0x03A0, 0x0398, 0x03A1, 0x03A3, 0x03A4, 0x03A5, 0x03C2, 0x03A9,
    0x039E, 0x03A8, 0x0396, 0x005B, 0x2234, 0x005D, 0x22A5, 0x005F,
    0xF8E5, 0x03B1, 0x03B2, 0x03C7, 0x03B4, 0x03B5, 0x03C6, 0x03B3,
    0x03B7, 0x03B9, 0x03D5, 0x03BA, 0x03BB, 0x03BC, 0x03BD, 0x03BF,
    0x03C0, 0x03B8, 0x03C1, 0x03C3, 0x03C4, 0x03C5, 0x03D6, 0x03C9,
    0x03BE, 0x03C8, 0x03B6, 0x007B, 0x007C, 0x007D, 0x223C, 0x0020, // .. 0x7F
    0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087,
    0x0088, 0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f,
    0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097,
    0x0098, 0x0099, 0x009a, 0x009b, 0x009c, 0x009d, 0x009E, 0x009f,
    0x20AC, 0x03D2, 0x2032, 0x2264, 0x2044, 0x221E, 0x0192, 0x2663, // 0xA0 ..
    0x2666, 0x2665, 0x2660, 0x2194, 0x2190, 0x2191, 0x2192, 0x2193,
    0x00B0, 0x00B1, 0x2033, 0x2265, 0x00D7, 0x221D, 0x2202, 0x2022,
    0x00F7, 0x2260, 0x2261, 0x2248, 0x2026, 0x23D0, 0x23AF, 0x21B5,
    0x2135, 0x2111, 0x211C, 0x2118, 0x2297, 0x2295, 0x2205, 0x2229,
    0x222A, 0x2283, 0x2287, 0x2284, 0x2282, 0x2286, 0x2208, 0x2209,
    0x2220, 0x2207, 0x00AE, 0x00A9, 0x2122, 0x220F, 0x221A, 0x22C5,
    0x00AC, 0x2227, 0x2228, 0x21D4, 0x21D0, 0x21D1, 0x21D2, 0x21D3,
    0x25CA, 0x3008, 0x00AE, 0x00A9, 0x2122, 0x2211, 0x239B, 0x239C,
    0x239D, 0x23A1, 0x23A2, 0x23A3, 0x23A7, 0x23A8, 0x23A9, 0x23AA,
    0xF8FF, 0x3009, 0x222B, 0x2320, 0x23AE, 0x2321, 0x239E, 0x239F,
    0x23A0, 0x23A4, 0x23A5, 0x23A6, 0x23AB, 0x23AC, 0x23AD, 0x0020  // .. 0xFE
  };

  if (!charset && !characters.empty())
    charset = getEncoding(&characters[0], characters.size());

  if (charset == 0x02) // SYMBOL
  {
    uint32_t ucs4Character = 0;
    for (std::vector<unsigned char>::const_iterator iter = characters.begin();
         iter != characters.end(); ++iter)
    {
      if (*iter < 0x20)
        ucs4Character = 0x20;
      else
        ucs4Character = symbolmap[*iter - 0x20];
      _appendUCS4(text, ucs4Character);
    }
  }
  else
  {
    UErrorCode status = U_ZERO_ERROR;
    UConverter *conv = nullptr;
    switch (charset)
    {
    case 0x80: // SHIFTJIS
      conv = ucnv_open("windows-932", &status);
      break;
    case 0x81: // HANGUL
      conv = ucnv_open("windows-949", &status);
      break;
    case 0x86: // GB2312
      conv = ucnv_open("windows-936", &status);
      break;
    case 0x88: // CHINESEBIG5
      conv = ucnv_open("windows-950", &status);
      break;
    case 0xa1: // GREEEK
      conv = ucnv_open("windows-1253", &status);
      break;
    case 0xa2: // TURKISH
      conv = ucnv_open("windows-1254", &status);
      break;
    case 0xa3: // VIETNAMESE
      conv = ucnv_open("windows-1258", &status);
      break;
    case 0xb1: // HEBREW
      conv = ucnv_open("windows-1255", &status);
      break;
    case 0xb2: // ARABIC
      conv = ucnv_open("windows-1256", &status);
      break;
    case 0xba: // BALTIC
      conv = ucnv_open("windows-1257", &status);
      break;
    case 0xcc: // RUSSIAN
      conv = ucnv_open("windows-1251", &status);
      break;
    case 0xde: // THAI
      conv = ucnv_open("windows-874", &status);
      break;
    case 0xee: // CENTRAL EUROPE
      conv = ucnv_open("windows-1250", &status);
      break;
    default:
      conv = ucnv_open("windows-1252", &status);
      break;
    }
    if (U_SUCCESS(status) && conv)
    {
      const auto *src = (const char *)&characters[0];
      const char *srcLimit = (const char *)src + characters.size();
      while (src < srcLimit)
      {
        UChar32 ucs4Character = ucnv_getNextUChar(conv, &src, srcLimit, &status);
        if (U_SUCCESS(status) && U_IS_UNICODE_CHAR(ucs4Character))
          _appendUCS4(text, ucs4Character);
      }
    }
    if (conv)
      ucnv_close(conv);
  }
}

void libcdr::appendCharacters(librevenge::RVNGString &text, std::vector<unsigned char> characters)
{
  if (characters.empty())
    return;

  UErrorCode status = U_ZERO_ERROR;
  UConverter *conv = ucnv_open("UTF-16LE", &status);

  if (U_SUCCESS(status) && conv)
  {
    const auto *src = (const char *)&characters[0];
    const char *srcLimit = (const char *)src + characters.size();
    while (src < srcLimit)
    {
      UChar32 ucs4Character = ucnv_getNextUChar(conv, &src, srcLimit, &status);
      if (U_SUCCESS(status) && U_IS_UNICODE_CHAR(ucs4Character))
        _appendUCS4(text, ucs4Character);
    }
  }
  if (conv)
    ucnv_close(conv);
}

// Appends the given bytes to text one by one, without any conversion.
// An empty vector leaves text untouched.
void libcdr::appendUTF8Characters(librevenge::RVNGString &text, std::vector<unsigned char> characters)
{
  for (const unsigned char byte : characters)
    text.append((char)byte);
}

#ifdef DEBUG

// printf-style diagnostic output: formats the variadic arguments according to
// format and writes the result to stderr. Only compiled when DEBUG is defined.
void libcdr::debugPrint(const char *const format, ...)
{
  va_list arguments;
  va_start(arguments, format);
  std::vfprintf(stderr, format, arguments);
  va_end(arguments);
}

// Renders the four bytes of value as a NUL-terminated four-character string.
// With bigEndian the most significant byte comes first, otherwise the least
// significant one. The result points to a static buffer that is overwritten
// by the next call.
const char *libcdr::toFourCC(unsigned value, bool bigEndian)
{
  static char sValue[5] = { 0, 0, 0, 0, 0 };
  for (int i = 0; i < 4; ++i)
  {
    // i counts bytes from the least significant end of value.
    const char byte = (char)((value >> (8 * i)) & 0xff);
    sValue[bigEndian ? 3 - i : i] = byte;
  }
  return sValue;
}
#endif

/* vim:set shiftwidth=2 softtabstop=2 expandtab: */

Coverage Report

Created: 2026-06-13 06:44

Line	Count	Source
1		/* -- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -- */
2		/*
3		* This file is part of the libcdr project.
4		*
5		* This Source Code Form is subject to the terms of the Mozilla Public
6		* License, v. 2.0. If a copy of the MPL was not distributed with this
7		* file, You can obtain one at http://mozilla.org/MPL/2.0/.
8		*/
9
10		#include "libcdr_utils.h"
11
12		#include <cassert>
13		#include <cstdarg>
14		#include <cstdio>
15		#include <string.h>
16
17		#include <unicode/ucsdet.h>
18		#include <unicode/ucnv.h>
19		#include <unicode/utypes.h>
20		#include <unicode/utf8.h>
21
22		#define CDR_NUM_ELEMENTS(array) sizeof(array)/sizeof(array[0])
23
24		#define SURROGATE_VALUE(h,l) (((h) - 0xd800) * 0x400 + (l) - 0xdc00 + 0x10000)
25
26		namespace
27		{
28
29		static unsigned short getEncodingFromICUName(const char *name)
30	588k	{
31		// ANSI
32	588k	if (strcmp(name, "ISO-8859-1") == 0)
33	23.8k	return 0;
34	564k	if (strcmp(name, "windows-1252") == 0)
35	2.92k	return 0;
36		// CENTRAL EUROPE
37	561k	if (strcmp(name, "ISO-8859-2") == 0)
38	11.8k	return 0xee;
39	549k	if (strcmp(name, "windows-1250") == 0)
40	1.74k	return 0xee;
41		// RUSSIAN
42	548k	if (strcmp(name, "ISO-8859-5") == 0)
43	2.08k	return 0xcc;
44	546k	if (strcmp(name, "windows-1251") == 0)
45	764	return 0xcc;
46	545k	if (strcmp(name, "KOI8-R") == 0)
47	913	return 0xcc;
48		// ARABIC
49	544k	if (strcmp(name, "ISO-8859-6") == 0)
50	109	return 0xb2;
51	544k	if (strcmp(name, "windows-1256") == 0)
52	50	return 0xb2;
53		// TURKISH
54	544k	if (strcmp(name, "ISO-8859-9") == 0)
55	216	return 0xa2;
56	544k	if (strcmp(name, "windows-1254") == 0)
57	79	return 0xa2;
58		// GREEK
59	543k	if (strcmp(name, "ISO-8859-7") == 0)
60	920	return 0xa1;
61	543k	if (strcmp(name, "windows-1253") == 0)
62	434	return 0xa1;
63		// HEBREW
64	542k	if (strcmp(name, "ISO-8859-8") == 0)
65	753	return 0xb1;
66	541k	if (strcmp(name, "windows-1255") == 0)
67	585	return 0xb1;
68		// JAPANESE
69	541k	if (strcmp(name, "Shift_JIS") == 0)
70	10.3k	return 0x80;
71	530k	if (strcmp(name, "ISO-2022-JP") == 0)
72	7	return 0x80;
73	530k	if (strcmp(name, "EUC-JP") == 0)
74	4.59k	return 0x80;
75	526k	if (strcmp(name, "windows-932") == 0)
76	0	return 0x80;
77		// KOREAN
78	526k	if (strcmp(name, "ISO-2022-KR") == 0)
79	0	return 0x81;
80	526k	if (strcmp(name, "EUC-KR") == 0)
81	49	return 0x81;
82	526k	if (strcmp(name, "windows-949") == 0)
83	0	return 0x81;
84		// CHINESE SIMPLIFIED
85	526k	if (strcmp(name, "ISO-2022-CN") == 0)
86	1.39k	return 0x86;
87	524k	if (strcmp(name, "GB18030") == 0)
88	17.3k	return 0x86;
89	507k	if (strcmp(name, "windows-936") == 0)
90	0	return 0x86;
91		// CHINESE TRADITIONAL
92	507k	if (strcmp(name, "Big5") == 0)
93	209	return 0x88;
94	507k	if (strcmp(name, "windows-950") == 0)
95	0	return 0x88;
96
97	507k	return 0;
98	507k	}
99
100		static unsigned short getEncoding(const unsigned char *buffer, unsigned long bufferLength)
101	629k	{
102	629k	if (!buffer)
103	0	return 0;
104	629k	UErrorCode status = U_ZERO_ERROR;
105	629k	UCharsetDetector *csd = nullptr;
106	629k	try
107	629k	{
108	629k	csd = ucsdet_open(&status);
109	629k	if (U_FAILURE(status) \|\| !csd)
110	0	return 0;
111	629k	ucsdet_enableInputFilter(csd, true);
112	629k	ucsdet_setText(csd, (const char *)buffer, (unsigned)bufferLength, &status);
113	629k	if (U_FAILURE(status))
114	0	throw libcdr::EncodingException();
115	629k	const UCharsetMatch *csm = ucsdet_detect(csd, &status);
116	629k	if (U_FAILURE(status) \|\| !csm)
117	40.4k	throw libcdr::EncodingException();
118	588k	const char *name = ucsdet_getName(csm, &status);
119	588k	if (U_FAILURE(status) \|\| !name)
120	0	throw libcdr::EncodingException();
121	588k	int32_t confidence = ucsdet_getConfidence(csm, &status);
122	588k	if (U_FAILURE(status))
123	0	throw libcdr::EncodingException();
124	588k	CDR_DEBUG_MSG(("UCSDET: getEncoding name %s, confidence %i\n", name, confidence));
125	588k	unsigned short encoding = getEncodingFromICUName(name);
126	588k	ucsdet_close(csd);
127		/* From ICU documentation
128		* A confidence value of ten does have a general meaning - it is used
129		* for charsets that can represent the input data, but for which there
130		* is no other indication that suggests that the charset is the correct
131		* one. Pure 7 bit ASCII data, for example, is compatible with a great
132		* many charsets, most of which will appear as possible matches with
133		* a confidence of 10.
134		*/
135	588k	if (confidence == 10)
136	49.9k	return 0;
137	538k	return encoding;
138	588k	}
139	629k	catch (const libcdr::EncodingException &)
140	629k	{
141	40.4k	ucsdet_close(csd);
142	40.4k	return 0;
143	40.4k	}
144	629k	}
145
146		static void _appendUCS4(librevenge::RVNGString &text, UChar32 ucs4Character)
147	50.4M	{
148		// Convert carriage returns to new line characters
149		// Writerperfect/LibreOffice will replace them by <text:line-break>
150	50.4M	if (ucs4Character == (UChar32) 0x0d)
151	17.0k	ucs4Character = (UChar32) '\n';
152
153	50.4M	unsigned char outbuf[U8_MAX_LENGTH+1];
154	50.4M	int i = 0;
155	50.4M	U8_APPEND_UNSAFE(&outbuf[0], i, ucs4Character);
156	50.4M	outbuf[i] = 0;
157
158	50.4M	text.append((char *)outbuf);
159	50.4M	}
160
161		} // anonymous namespace
162
163		uint8_t libcdr::readU8(librevenge::RVNGInputStream input, bool / bigEndian */)
164	298M	{
165	298M	if (!input \|\| input->isEnd())
166	20.9k	{
167	20.9k	CDR_DEBUG_MSG(("Throwing EndOfStreamException\n"));
168	20.9k	throw EndOfStreamException();
169	20.9k	}
170	298M	unsigned long numBytesRead;
171	298M	uint8_t const *p = input->read(sizeof(uint8_t), numBytesRead);
172
173	298M	if (p && numBytesRead == sizeof(uint8_t))
174	298M	return (uint8_t const )(p);
175	0	CDR_DEBUG_MSG(("Throwing EndOfStreamException\n"));
176	0	throw EndOfStreamException();
177	298M	}
178
179		uint16_t libcdr::readU16(librevenge::RVNGInputStream *input, bool bigEndian)
180	778M	{
181	778M	if (!input \|\| input->isEnd())
182	14.0k	{
183	14.0k	CDR_DEBUG_MSG(("Throwing EndOfStreamException\n"));
184	14.0k	throw EndOfStreamException();
185	14.0k	}
186	777M	unsigned long numBytesRead;
187	777M	uint8_t const *p = input->read(sizeof(uint16_t), numBytesRead);
188
189	777M	if (p && numBytesRead == sizeof(uint16_t))
190	777M	{
191	777M	if (bigEndian)
192	550	return (uint16_t)(p[1]\|((uint16_t)p[0]<<8));
193	777M	return (uint16_t)(p[0]\|((uint16_t)p[1]<<8));
194	777M	}
195	10.4k	CDR_DEBUG_MSG(("Throwing EndOfStreamException\n"));
196	10.4k	throw EndOfStreamException();
197	777M	}
198
199		int16_t libcdr::readS16(librevenge::RVNGInputStream *input, bool bigEndian)
200	79.5M	{
201	79.5M	return (int16_t)readU16(input, bigEndian);
202	79.5M	}
203
204		uint32_t libcdr::readU32(librevenge::RVNGInputStream *input, bool bigEndian)
205	234M	{
206	234M	if (!input \|\| input->isEnd())
207	56.4k	{
208	56.4k	CDR_DEBUG_MSG(("Throwing EndOfStreamException\n"));
209	56.4k	throw EndOfStreamException();
210	56.4k	}
211	234M	unsigned long numBytesRead;
212	234M	uint8_t const *p = input->read(sizeof(uint32_t), numBytesRead);
213
214	234M	if (p && numBytesRead == sizeof(uint32_t))
215	234M	{
216	234M	if (bigEndian)
217	1.06k	return (uint32_t)p[3]\|((uint32_t)p[2]<<8)\|((uint32_t)p[1]<<16)\|((uint32_t)p[0]<<24);
218	234M	return (uint32_t)p[0]\|((uint32_t)p[1]<<8)\|((uint32_t)p[2]<<16)\|((uint32_t)p[3]<<24);
219	234M	}
220	23.6k	CDR_DEBUG_MSG(("Throwing EndOfStreamException\n"));
221	23.6k	throw EndOfStreamException();
222	234M	}
223
224		int32_t libcdr::readS32(librevenge::RVNGInputStream *input, bool bigEndian)
225	107M	{
226	107M	return (int32_t)readU32(input, bigEndian);
227	107M	}
228
229		uint64_t libcdr::readU64(librevenge::RVNGInputStream *input, bool bigEndian)
230	15.9M	{
231	15.9M	if (!input \|\| input->isEnd())
232	40	{
233	40	CDR_DEBUG_MSG(("Throwing EndOfStreamException\n"));
234	40	throw EndOfStreamException();
235	40	}
236	15.9M	unsigned long numBytesRead;
237	15.9M	uint8_t const *p = input->read(sizeof(uint64_t), numBytesRead);
238
239	15.9M	if (p && numBytesRead == sizeof(uint64_t))
240	15.9M	{
241	15.9M	if (bigEndian)
242	131	return (uint64_t)p[7]\|((uint64_t)p[6]<<8)\|((uint64_t)p[5]<<16)\|((uint64_t)p[4]<<24)\|((uint64_t)p[3]<<32)\|((uint64_t)p[2]<<40)\|((uint64_t)p[1]<<48)\|((uint64_t)p[0]<<56);
243	15.9M	return (uint64_t)p[0]\|((uint64_t)p[1]<<8)\|((uint64_t)p[2]<<16)\|((uint64_t)p[3]<<24)\|((uint64_t)p[4]<<32)\|((uint64_t)p[5]<<40)\|((uint64_t)p[6]<<48)\|((uint64_t)p[7]<<56);
244	15.9M	}
245	62	CDR_DEBUG_MSG(("Throwing EndOfStreamException\n"));
246	62	throw EndOfStreamException();
247	15.9M	}
248
249		double libcdr::readDouble(librevenge::RVNGInputStream *input, bool bigEndian)
250	7.19M	{
251	7.19M	union
252	7.19M	{
253	7.19M	uint64_t u;
254	7.19M	double d;
255	7.19M	} tmpUnion;
256
257	7.19M	tmpUnion.u = readU64(input, bigEndian);
258
259	7.19M	return tmpUnion.d;
260	7.19M	}
261
262		double libcdr::readFixedPoint(librevenge::RVNGInputStream *input, bool bigEndian)
263	81.5k	{
264	81.5k	unsigned fixedPointNumber = readU32(input, bigEndian);
265	81.5k	auto fixedPointNumberIntegerPart = (short)((fixedPointNumber & 0xFFFF0000) >> 16);
266	81.5k	auto fixedPointNumberFractionalPart = (double)((double)(fixedPointNumber & 0x0000FFFF)/(double)0xFFFF);
267	81.5k	return ((double)fixedPointNumberIntegerPart + fixedPointNumberFractionalPart);
268	81.5k	}
269
270		unsigned long libcdr::getLength(librevenge::RVNGInputStream *const input)
271	7.84M	{
272	7.84M	if (!input)
273	0	throw EndOfStreamException();
274
275	7.84M	const long orig = input->tell();
276	7.84M	long end = 0;
277
278	7.84M	if (input->seek(0, librevenge::RVNG_SEEK_END) == 0)
279	7.84M	{
280	7.84M	end = input->tell();
281	7.84M	}
282	0	else
283	0	{
284		// RVNG_SEEK_END does not work. Use the harder way.
285	0	if (input->seek(0, librevenge::RVNG_SEEK_SET) != 0)
286	0	throw EndOfStreamException();
287	0	while (!input->isEnd())
288	0	{
289	0	readU8(input);
290	0	++end;
291	0	}
292	0	}
293	7.84M	assert(end >= 0);
294
295	7.84M	if (input->seek(orig, librevenge::RVNG_SEEK_SET) != 0)
296	0	throw EndOfStreamException();
297
298	7.84M	return static_cast<unsigned long>(end);
299	7.84M	}
300
301		unsigned long libcdr::getRemainingLength(librevenge::RVNGInputStream *const input)
302	7.70M	{
303	7.70M	return getLength(input) - static_cast<unsigned long>(input->tell());
304	7.70M	}
305
306		int libcdr::cdr_round(double d)
307	5.55M	{
308	5.55M	return (d>0) ? int(d+0.5) : int(d-0.5);
309	5.55M	}
310
311		void libcdr::writeU16(librevenge::RVNGBinaryData &buffer, const int value)
312	85.2k	{
313	85.2k	buffer.append((unsigned char)(value & 0xFF));
314	85.2k	buffer.append((unsigned char)((value >> 8) & 0xFF));
315	85.2k	}
316
317		void libcdr::writeU32(librevenge::RVNGBinaryData &buffer, const int value)
318	10.8M	{
319	10.8M	buffer.append((unsigned char)(value & 0xFF));
320	10.8M	buffer.append((unsigned char)((value >> 8) & 0xFF));
321	10.8M	buffer.append((unsigned char)((value >> 16) & 0xFF));
322	10.8M	buffer.append((unsigned char)((value >> 24) & 0xFF));
323	10.8M	}
324
325		void libcdr::appendCharacters(librevenge::RVNGString &text, std::vector<unsigned char> characters, unsigned short charset)
326	3.65M	{
327	3.65M	if (characters.empty())
328	6.24k	return;
329	3.64M	static const UChar32 symbolmap [] =
330	3.64M	{
331	3.64M	0x0020, 0x0021, 0x2200, 0x0023, 0x2203, 0x0025, 0x0026, 0x220D, // 0x20 ..
332	3.64M	0x0028, 0x0029, 0x2217, 0x002B, 0x002C, 0x2212, 0x002E, 0x002F,
333	3.64M	0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037,
334	3.64M	0x0038, 0x0039, 0x003A, 0x003B, 0x003C, 0x003D, 0x003E, 0x003F,
335	3.64M	0x2245, 0x0391, 0x0392, 0x03A7, 0x0394, 0x0395, 0x03A6, 0x0393,
336	3.64M	0x0397, 0x0399, 0x03D1, 0x039A, 0x039B, 0x039C, 0x039D, 0x039F,
337	3.64M	0x03A0, 0x0398, 0x03A1, 0x03A3, 0x03A4, 0x03A5, 0x03C2, 0x03A9,
338	3.64M	0x039E, 0x03A8, 0x0396, 0x005B, 0x2234, 0x005D, 0x22A5, 0x005F,
339	3.64M	0xF8E5, 0x03B1, 0x03B2, 0x03C7, 0x03B4, 0x03B5, 0x03C6, 0x03B3,
340	3.64M	0x03B7, 0x03B9, 0x03D5, 0x03BA, 0x03BB, 0x03BC, 0x03BD, 0x03BF,
341	3.64M	0x03C0, 0x03B8, 0x03C1, 0x03C3, 0x03C4, 0x03C5, 0x03D6, 0x03C9,
342	3.64M	0x03BE, 0x03C8, 0x03B6, 0x007B, 0x007C, 0x007D, 0x223C, 0x0020, // .. 0x7F
343	3.64M	0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087,
344	3.64M	0x0088, 0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f,
345	3.64M	0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097,
346	3.64M	0x0098, 0x0099, 0x009a, 0x009b, 0x009c, 0x009d, 0x009E, 0x009f,
347	3.64M	0x20AC, 0x03D2, 0x2032, 0x2264, 0x2044, 0x221E, 0x0192, 0x2663, // 0xA0 ..
348	3.64M	0x2666, 0x2665, 0x2660, 0x2194, 0x2190, 0x2191, 0x2192, 0x2193,
349	3.64M	0x00B0, 0x00B1, 0x2033, 0x2265, 0x00D7, 0x221D, 0x2202, 0x2022,
350	3.64M	0x00F7, 0x2260, 0x2261, 0x2248, 0x2026, 0x23D0, 0x23AF, 0x21B5,
351	3.64M	0x2135, 0x2111, 0x211C, 0x2118, 0x2297, 0x2295, 0x2205, 0x2229,
352	3.64M	0x222A, 0x2283, 0x2287, 0x2284, 0x2282, 0x2286, 0x2208, 0x2209,
353	3.64M	0x2220, 0x2207, 0x00AE, 0x00A9, 0x2122, 0x220F, 0x221A, 0x22C5,
354	3.64M	0x00AC, 0x2227, 0x2228, 0x21D4, 0x21D0, 0x21D1, 0x21D2, 0x21D3,
355	3.64M	0x25CA, 0x3008, 0x00AE, 0x00A9, 0x2122, 0x2211, 0x239B, 0x239C,
356	3.64M	0x239D, 0x23A1, 0x23A2, 0x23A3, 0x23A7, 0x23A8, 0x23A9, 0x23AA,
357	3.64M	0xF8FF, 0x3009, 0x222B, 0x2320, 0x23AE, 0x2321, 0x239E, 0x239F,
358	3.64M	0x23A0, 0x23A4, 0x23A5, 0x23A6, 0x23AB, 0x23AC, 0x23AD, 0x0020 // .. 0xFE
359	3.64M	};
360
361	3.64M	if (!charset && !characters.empty())
362	629k	charset = getEncoding(&characters[0], characters.size());
363
364	3.64M	if (charset == 0x02) // SYMBOL
365	1.20k	{
366	1.20k	uint32_t ucs4Character = 0;
367	1.20k	for (std::vector<unsigned char>::const_iterator iter = characters.begin();
368	5.45k	iter != characters.end(); ++iter)
369	4.25k	{
370	4.25k	if (*iter < 0x20)
371	1.38k	ucs4Character = 0x20;
372	2.86k	else
373	2.86k	ucs4Character = symbolmap[*iter - 0x20];
374	4.25k	_appendUCS4(text, ucs4Character);
375	4.25k	}
376	1.20k	}
377	3.64M	else
378	3.64M	{
379	3.64M	UErrorCode status = U_ZERO_ERROR;
380	3.64M	UConverter *conv = nullptr;
381	3.64M	switch (charset)
382	3.64M	{
383	613	case 0x80: // SHIFTJIS
384	613	conv = ucnv_open("windows-932", &status);
385	613	break;
386	868	case 0x81: // HANGUL
387	868	conv = ucnv_open("windows-949", &status);
388	868	break;
389	1.76k	case 0x86: // GB2312
390	1.76k	conv = ucnv_open("windows-936", &status);
391	1.76k	break;
392	2.31k	case 0x88: // CHINESEBIG5
393	2.31k	conv = ucnv_open("windows-950", &status);
394	2.31k	break;
395	1.55k	case 0xa1: // GREEEK
396	1.55k	conv = ucnv_open("windows-1253", &status);
397	1.55k	break;
398	1.60k	case 0xa2: // TURKISH
399	1.60k	conv = ucnv_open("windows-1254", &status);
400	1.60k	break;
401	1.68k	case 0xa3: // VIETNAMESE
402	1.68k	conv = ucnv_open("windows-1258", &status);
403	1.68k	break;
404	1.43k	case 0xb1: // HEBREW
405	1.43k	conv = ucnv_open("windows-1255", &status);
406	1.43k	break;
407	365	case 0xb2: // ARABIC
408	365	conv = ucnv_open("windows-1256", &status);
409	365	break;
410	214	case 0xba: // BALTIC
411	214	conv = ucnv_open("windows-1257", &status);
412	214	break;
413	5.22k	case 0xcc: // RUSSIAN
414	5.22k	conv = ucnv_open("windows-1251", &status);
415	5.22k	break;
416	348	case 0xde: // THAI
417	348	conv = ucnv_open("windows-874", &status);
418	348	break;
419	13.7k	case 0xee: // CENTRAL EUROPE
420	13.7k	conv = ucnv_open("windows-1250", &status);
421	13.7k	break;
422	3.61M	default:
423	3.61M	conv = ucnv_open("windows-1252", &status);
424	3.61M	break;
425	3.64M	}
426	3.64M	if (U_SUCCESS(status) && conv)
427	3.64M	{
428	3.64M	const auto src = (const char )&characters[0];
429	3.64M	const char srcLimit = (const char )src + characters.size();
430	16.1M	while (src < srcLimit)
431	12.4M	{
432	12.4M	UChar32 ucs4Character = ucnv_getNextUChar(conv, &src, srcLimit, &status);
433	12.4M	if (U_SUCCESS(status) && U_IS_UNICODE_CHAR(ucs4Character))
434	12.4M	_appendUCS4(text, ucs4Character);
435	12.4M	}
436	3.64M	}
437	3.64M	if (conv)
438	3.64M	ucnv_close(conv);
439	3.64M	}
440	3.64M	}
441
442		void libcdr::appendCharacters(librevenge::RVNGString &text, std::vector<unsigned char> characters)
443	1.43M	{
444	1.43M	if (characters.empty())
445	11.4k	return;
446
447	1.42M	UErrorCode status = U_ZERO_ERROR;
448	1.42M	UConverter *conv = ucnv_open("UTF-16LE", &status);
449
450	1.42M	if (U_SUCCESS(status) && conv)
451	1.42M	{
452	1.42M	const auto src = (const char )&characters[0];
453	1.42M	const char srcLimit = (const char )src + characters.size();
454	39.8M	while (src < srcLimit)
455	38.4M	{
456	38.4M	UChar32 ucs4Character = ucnv_getNextUChar(conv, &src, srcLimit, &status);
457	38.4M	if (U_SUCCESS(status) && U_IS_UNICODE_CHAR(ucs4Character))
458	37.9M	_appendUCS4(text, ucs4Character);
459	38.4M	}
460	1.42M	}
461	1.42M	if (conv)
462	1.42M	ucnv_close(conv);
463	1.42M	}
464
465		void libcdr::appendUTF8Characters(librevenge::RVNGString &text, std::vector<unsigned char> characters)
466	33.3k	{
467	33.3k	if (characters.empty())
468	24.3k	return;
469
470	210M	for (std::vector<unsigned char>::const_iterator iter = characters.begin(); iter != characters.end(); ++iter)
471	210M	text.append((char)*iter);
472	9.03k	}
473
474		#ifdef DEBUG
475
476		void libcdr::debugPrint(const char *const format, ...)
477		{
478		va_list args;
479		va_start(args, format);
480		std::vfprintf(stderr, format, args);
481		va_end(args);
482		}
483
484		const char *libcdr::toFourCC(unsigned value, bool bigEndian)
485		{
486		static char sValue[5] = { 0, 0, 0, 0, 0 };
487		if (bigEndian)
488		{
489		sValue[3] = (char)(value & 0xff);
490		sValue[2] = (char)((value & 0xff00) >> 8);
491		sValue[1] = (char)((value & 0xff0000) >> 16);
492		sValue[0] = (char)((value & 0xff000000) >> 24);
493		}
494		else
495		{
496		sValue[0] = (char)(value & 0xff);
497		sValue[1] = (char)((value & 0xff00) >> 8);
498		sValue[2] = (char)((value & 0xff0000) >> 16);
499		sValue[3] = (char)((value & 0xff000000) >> 24);
500		}
501		return sValue;
502		}
503		#endif
504
505		/* vim:set shiftwidth=2 softtabstop=2 expandtab: */