Coverage Report

Created: 2026-04-29 07:28

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/libe-book/src/lib/TealDocParser.cpp
Line
Count
Source
1
/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2
/*
3
 * This file is part of the libe-book project.
4
 *
5
 * This Source Code Form is subject to the terms of the Mozilla Public
6
 * License, v. 2.0. If a copy of the MPL was not distributed with this
7
 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
8
 */
9
10
#include <algorithm>
11
#include <cassert>
12
#include <cstring>
13
#include <deque>
14
#include <utility>
15
16
#include <boost/fusion/include/std_pair.hpp>
17
#include <boost/optional.hpp>
18
#include <boost/spirit/include/qi.hpp>
19
#include <boost/variant.hpp>
20
21
#include <librevenge/librevenge.h>
22
23
#include "libebook_utils.h"
24
#include "EBOOKCharsetConverter.h"
25
#include "EBOOKMemoryStream.h"
26
#include "EBOOKUTF8Stream.h"
27
#include "TealDocParser.h"
28
#include "PDBLZ77Stream.h"
29
30
using boost::optional;
31
32
using std::deque;
33
using std::string;
34
using std::vector;
35
36
namespace libebook
37
{
38
39
namespace
40
{
41
42
static const unsigned TEALDOC_BLOCK_SIZE = 4096;
43
44
static const unsigned TEALDOC_TYPE = PDB_CODE("TEXt");
45
static const unsigned TEALDOC_CREATOR = PDB_CODE("TlDc");
46
47
}
48
49
namespace
50
{
51
52
enum Font
53
{
54
  FONT_NORMAL,
55
  FONT_BOLD,
56
  FONT_LARGE
57
};
58
59
enum Style
60
{
61
  STYLE_NORMAL,
62
  STYLE_UNDERLINE,
63
  STYLE_INVERT
64
};
65
66
enum Align
67
{
68
  ALIGN_LEFT,
69
  ALIGN_RIGHT,
70
  ALIGN_CENTER
71
};
72
73
enum Token
74
{
75
  TOKEN_UNKNOWN,
76
  // tags
77
  TOKEN_BOOKMARK,
78
  TOKEN_HEADER,
79
  TOKEN_HRULE,
80
  TOKEN_LABEL,
81
  TOKEN_LINK,
82
  TOKEN_TEALPAINT,
83
  // attributes
84
  TOKEN_ALIGN,
85
  TOKEN_FONT,
86
  TOKEN_STYLE,
87
  TOKEN_TEXT,
88
  // values
89
  TOKEN_0,
90
  TOKEN_1,
91
  TOKEN_2,
92
  TOKEN_CENTER,
93
  TOKEN_INVERT,
94
  TOKEN_LEFT,
95
  TOKEN_NORMAL,
96
  TOKEN_RIGHT,
97
  TOKEN_UNDERLINE
98
};
99
100
struct TealDocAttributes
101
{
102
  TealDocAttributes();
103
104
  optional<Font> font;
105
  optional<Style> style;
106
  optional<Align> align;
107
};
108
109
TealDocAttributes::TealDocAttributes()
110
19.0k
  : font()
111
19.0k
  , style()
112
19.0k
  , align()
113
19.0k
{
114
19.0k
}
115
116
}
117
118
namespace
119
{
120
121
namespace qi = boost::spirit::qi;
122
123
typedef boost::variant<int, string> AttrValue_t;
124
typedef deque<std::pair<int, AttrValue_t>> Attributes_t;
125
126
template<typename Iterator>
127
struct TagGrammar : public qi::grammar<Iterator, std::pair<int, Attributes_t>, qi::space_type>
128
{
129
20.9k
  TagGrammar() : TagGrammar::base_type(tag, "tag")
130
20.9k
  {
131
20.9k
    using qi::alnum;
132
20.9k
    using qi::alpha;
133
20.9k
    using qi::attr;
134
20.9k
    using qi::char_;
135
20.9k
    using qi::lexeme;
136
20.9k
    using qi::no_case;
137
20.9k
    using qi::omit;
138
20.9k
    using qi::space;
139
140
20.9k
    tag %= '<' >> (no_case[tags] | unknown) >> *attrib >> '>';
141
142
20.9k
    attrib %=
143
20.9k
      no_case[eattrs] >> '=' >> (values | text) // accept unknown values too
144
20.9k
      | no_case[sattrs] >> '=' >> text
145
20.9k
      | unknown >> '=' >> text
146
20.9k
      ;
147
148
20.9k
    value %=
149
20.9k
      '\'' >> values >> '\''
150
20.9k
      | '\"' >> values >> '\"'
151
20.9k
      | values
152
20.9k
      ;
153
154
20.9k
    text %=
155
20.9k
      '\'' >> lexeme[+(char_ - '\'')] >> '\''
156
20.9k
      | '\"' >> lexeme[+(char_ - '\"')] >> '\"'
157
20.9k
      | lexeme[+(char_ - space)]
158
20.9k
      ;
159
160
20.9k
    unknown %= omit[alpha >> *alnum] >> attr(TOKEN_UNKNOWN);
161
162
20.9k
    tag.name("tag");
163
20.9k
    attrib.name("attrib");
164
20.9k
    value.name("value");
165
20.9k
    text.name("text");
166
20.9k
    unknown.name("unknown");
167
168
20.9k
    tags.name("tags");
169
20.9k
    eattrs.name("eattrs");
170
20.9k
    sattrs.name("sattrs");
171
20.9k
    values.name("values");
172
20.9k
  }
173
174
  struct tags_ : qi::symbols<char, int>
175
  {
176
    tags_()
177
20.9k
    {
178
20.9k
      add
179
20.9k
      ("bookmark", TOKEN_BOOKMARK)
180
20.9k
      ("header", TOKEN_HEADER)
181
20.9k
      ("hrule", TOKEN_HRULE)
182
20.9k
      ("label", TOKEN_LABEL)
183
20.9k
      ("link", TOKEN_LINK)
184
20.9k
      ("tealpaint", TOKEN_TEALPAINT)
185
20.9k
      ;
186
20.9k
    }
187
  } tags;
188
189
  struct eattrs_ : qi::symbols<char, int>
190
  {
191
    eattrs_()
192
20.9k
    {
193
20.9k
      add
194
20.9k
      ("align", TOKEN_ALIGN)
195
20.9k
      ("font", TOKEN_FONT)
196
20.9k
      ("style", TOKEN_STYLE)
197
20.9k
      ;
198
20.9k
    }
199
  } eattrs;
200
201
  struct sattrs_ : qi::symbols<char, int>
202
  {
203
    sattrs_()
204
20.9k
    {
205
20.9k
      add
206
20.9k
      ("text", TOKEN_TEXT)
207
20.9k
      ;
208
20.9k
    }
209
  } sattrs;
210
211
  struct values_ : qi::symbols<char, int>
212
  {
213
    values_()
214
20.9k
    {
215
20.9k
      add
216
20.9k
      ("0", TOKEN_0)
217
20.9k
      ("1", TOKEN_1)
218
20.9k
      ("2", TOKEN_2)
219
20.9k
      ("center", TOKEN_CENTER)
220
20.9k
      ("invert", TOKEN_INVERT)
221
20.9k
      ("left", TOKEN_LEFT)
222
20.9k
      ("normal", TOKEN_NORMAL)
223
20.9k
      ("right", TOKEN_RIGHT)
224
20.9k
      ("underline", TOKEN_UNDERLINE)
225
20.9k
      ;
226
20.9k
    }
227
  } values;
228
229
  qi::rule<Iterator, std::pair<int, Attributes_t>, qi::space_type> tag;
230
  qi::rule<Iterator, std::pair<int, AttrValue_t>, qi::space_type> attrib;
231
  qi::rule<Iterator, AttrValue_t, qi::space_type> value;
232
  qi::rule<Iterator, AttrValue_t, qi::space_type> text;
233
  qi::rule<Iterator, int, qi::space_type> unknown;
234
};
235
236
}
237
238
namespace
239
{
240
241
class AttributeHandler : public boost::static_visitor<>
242
{
243
public:
244
  AttributeHandler(const int tokenId, TealDocAttributes &attributes, string &text)
245
0
    : m_attr(tokenId)
246
0
    , m_attributes(attributes)
247
0
    , m_text(text)
248
0
  {
249
0
  }
250
251
  void operator()(const int value) const
252
0
  {
253
0
    switch (m_attr)
254
0
    {
255
0
    case TOKEN_ALIGN :
256
0
    {
257
0
      switch (value)
258
0
      {
259
0
      case TOKEN_CENTER :
260
0
        m_attributes.align = ALIGN_CENTER;
261
0
        break;
262
0
      case TOKEN_LEFT :
263
0
        m_attributes.align = ALIGN_LEFT;
264
0
        break;
265
0
      case TOKEN_RIGHT :
266
0
        m_attributes.align = ALIGN_RIGHT;
267
0
        break;
268
0
      default :
269
0
        EBOOK_DEBUG_MSG(("unknown alignment %d\n", value));
270
0
      }
271
0
      break;
272
0
    }
273
0
    case TOKEN_FONT :
274
0
    {
275
0
      switch (value)
276
0
      {
277
0
      case TOKEN_0 :
278
0
        m_attributes.font = FONT_NORMAL;
279
0
        break;
280
0
      case TOKEN_1 :
281
0
        m_attributes.font = FONT_BOLD;
282
0
        break;
283
0
      case TOKEN_2 :
284
0
        m_attributes.font = FONT_LARGE;
285
0
        break;
286
0
      default :
287
0
        EBOOK_DEBUG_MSG(("unknown font type %d\n", value));
288
0
      }
289
0
      break;
290
0
    }
291
0
    case TOKEN_STYLE :
292
0
    {
293
0
      switch (value)
294
0
      {
295
0
      case TOKEN_INVERT :
296
0
        m_attributes.style = STYLE_INVERT;
297
0
        break;
298
0
      case TOKEN_NORMAL :
299
0
        m_attributes.style = STYLE_NORMAL;
300
0
        break;
301
0
      case TOKEN_UNDERLINE :
302
0
        m_attributes.style = STYLE_UNDERLINE;
303
0
        break;
304
0
      default :
305
0
        EBOOK_DEBUG_MSG(("unknown style %d\n", value));
306
0
      }
307
0
      break;
308
0
    }
309
0
    default :
310
0
      break;
311
0
    }
312
0
  }
313
314
  void operator()(const std::string &value) const
315
0
  {
316
0
    if (m_attr == TOKEN_TEXT)
317
0
      m_text = value;
318
0
  }
319
320
private:
321
  const int m_attr;
322
  TealDocAttributes &m_attributes;
323
  string &m_text;
324
};
325
326
}
327
328
class TealDocTextParser
329
{
330
  // -Weffc++
331
  TealDocTextParser(const TealDocTextParser &other);
332
  TealDocTextParser operator=(const TealDocTextParser &other);
333
334
public:
335
  explicit TealDocTextParser(librevenge::RVNGTextInterface *document);
336
337
  void parse(librevenge::RVNGInputStream *input, bool last = false);
338
339
private:
340
  bool parseTag(librevenge::RVNGInputStream *input);
341
342
  bool parseHeaderTag(const Attributes_t &attributeList);
343
344
  void openParagraph(const TealDocAttributes &attributes = TealDocAttributes());
345
  void closeParagraph();
346
347
  void finishParagraph();
348
  void flushText(const TealDocAttributes &attributes = TealDocAttributes());
349
350
private:
351
  librevenge::RVNGTextInterface *const m_document;
352
353
  string m_text;
354
355
  bool m_openedParagraph;
356
};
357
358
// Starts with no pending text and no open paragraph; document receives all
// output produced later.
TealDocTextParser::TealDocTextParser(librevenge::RVNGTextInterface *const document)
  : m_document(document), m_text(), m_openedParagraph(false)
{
}
364
365
// Reads input byte by byte until its end:
//  - '\n' finishes the current paragraph,
//  - '<' starts a tag attempt; if it is not a valid tag the '<' is kept as text
//    (parseTag() rewinds, so the following bytes are then read as text too),
//  - everything else is appended to the pending text.
// If last is set, the paragraph still pending at the end is finished as well.
void TealDocTextParser::parse(librevenge::RVNGInputStream *const input, const bool last)
{
  while (!input->isEnd())
  {
    const unsigned char ch = readU8(input);

    if ('\n' == ch)
      finishParagraph();
    else if ('<' != ch)
      m_text.push_back(static_cast<char>(ch));
    else if (!parseTag(input))
      m_text.push_back('<');
  }

  if (last)
    finishParagraph();
}
388
389
// Tries to parse a tag; the caller has already consumed the opening '<'.
//
// Reads up to and including the next '>' and runs TagGrammar over the result.
// On success the tag is processed (currently only header tags have an effect)
// and true is returned. Otherwise the stream is rewound to where it was on
// entry and false is returned, so that the caller can treat the '<' and what
// follows as ordinary text.
bool TealDocTextParser::parseTag(librevenge::RVNGInputStream *const input)
{
  const long pos = input->tell();
  string tag("<");

  // read tag into string
  // I suppose it would be possible to create an iterator adaptor for
  // librevenge::RVNGInputStream, but this is much simpler
  //
  // Stop at the end of the stream: a '<' without a matching '>' in this
  // input is not a tag. Reading on regardless would rely on readU8() to
  // signal the end (presumably by throwing -- its definition is not visible
  // here), which would abort the whole record instead of keeping the text.
  bool terminated = false;
  while (!terminated && !input->isEnd())
  {
    const unsigned char c = readU8(input);
    tag.push_back(static_cast<char>(c));
    terminated = '>' == c;
  }

  bool success = false;

  if (terminated)
  {
    // parse tag
    std::pair<int, Attributes_t> parsedTag;

    auto it = tag.cbegin();
    const bool match = qi::phrase_parse(
                         it, tag.cend(),
                         TagGrammar<string::const_iterator>(), qi::space,
                         parsedTag
                       );
    // the grammar must have consumed the whole "<...>" string
    success = match && it == tag.cend();

    // process tag
    if (success && parsedTag.first == TOKEN_HEADER)
    {
      // a header is a paragraph of its own: finish the running one first
      finishParagraph();
      parseHeaderTag(parsedTag.second);
    }
    /* TODO: handle TOKEN_TEALPAINT
       ok to ignore: TOKEN_BOOKMARK, TOKEN_HRULE, TOKEN_LABEL, TOKEN_LINK
       unknown 10 other enumerations
    */
  }

  if (!success)
    input->seek(pos, librevenge::RVNG_SEEK_SET);

  return success;
}
434
435
bool TealDocTextParser::parseHeaderTag(const Attributes_t &attributeList)
436
0
{
437
0
  TealDocAttributes attributes;
438
439
0
  for (const auto &it : attributeList)
440
0
  {
441
0
    if (it.first != TOKEN_UNKNOWN)
442
0
      boost::apply_visitor(AttributeHandler(it.first, attributes, m_text), it.second);
443
0
  }
444
445
0
  openParagraph(attributes);
446
0
  flushText(attributes);
447
0
  closeParagraph();
448
449
0
  return true;
450
0
}
451
452
void TealDocTextParser::openParagraph(const TealDocAttributes &attributes)
453
19.0k
{
454
19.0k
  librevenge::RVNGPropertyList props;
455
456
19.0k
  if (attributes.align)
457
0
  {
458
0
    switch (get(attributes.align))
459
0
    {
460
0
    case ALIGN_LEFT :
461
0
      props.insert("fo:text-align", "left");
462
0
      break;
463
0
    case ALIGN_RIGHT :
464
0
      props.insert("fo:text-align", "end");
465
0
      break;
466
0
    case ALIGN_CENTER :
467
0
      props.insert("fo:text-align", "center");
468
0
      break;
469
0
    default :
470
0
      break;
471
0
    }
472
0
  }
473
474
19.0k
  m_document->openParagraph(props);
475
19.0k
  m_openedParagraph = true;
476
19.0k
}
477
478
void TealDocTextParser::closeParagraph()
479
19.0k
{
480
19.0k
  if (m_openedParagraph)
481
19.0k
    m_document->closeParagraph();
482
19.0k
  m_openedParagraph = false;
483
19.0k
}
484
485
void TealDocTextParser::finishParagraph()
486
19.0k
{
487
19.0k
  flushText();
488
19.0k
  closeParagraph();
489
19.0k
}
490
491
void TealDocTextParser::flushText(const TealDocAttributes &attributes)
492
19.0k
{
493
19.0k
  if (!m_openedParagraph)
494
19.0k
    openParagraph(attributes);
495
496
19.0k
  if (!m_text.empty())
497
10.8k
  {
498
10.8k
    librevenge::RVNGPropertyList props;
499
500
10.8k
    if (attributes.font)
501
0
    {
502
0
      switch (get(attributes.font))
503
0
      {
504
0
      case FONT_BOLD :
505
0
        props.insert("fo:font-weight", "bold");
506
0
        break;
507
0
      case FONT_LARGE :
508
        // TODO: handle
509
0
        break;
510
0
      case FONT_NORMAL :
511
      // fall through
512
0
      default :
513
0
        break;
514
0
      }
515
0
    }
516
517
10.8k
    if (attributes.style)
518
0
    {
519
0
      switch (get(attributes.style))
520
0
      {
521
0
      case STYLE_UNDERLINE :
522
0
        props.insert("style:text-underline-type", "single");
523
0
        break;
524
0
      case STYLE_INVERT :
525
0
        props.insert("fo:color", "#FFFFFF");
526
0
        props.insert("fo:background-color", "#000000");
527
0
        break;
528
0
      case STYLE_NORMAL :
529
      // fall through
530
0
      default :
531
0
        break;
532
0
      }
533
0
    }
534
535
10.8k
    m_document->openSpan(props);
536
10.8k
    m_document->insertText(librevenge::RVNGString(m_text.c_str()));
537
10.8k
    m_document->closeSpan();
538
539
10.8k
    m_text.clear();
540
10.8k
  }
541
19.0k
}
542
543
// Sets up a parser for a TealDoc PDB database read from input and written to
// document. The index-record fields start at zero/false and are filled in by
// readIndexRecord(); the charset converter is created lazily when the first
// data record is read.
TealDocParser::TealDocParser(librevenge::RVNGInputStream *input, librevenge::RVNGTextInterface *document)
  : PDBParser(input, document, TEALDOC_TYPE, TEALDOC_CREATOR)
  // values from the index record
  , m_compressed(false)
  , m_textLength(0)
  , m_recordCount(0)
  , m_recordSize(0)
  // progress of reading the data records
  , m_read(0)
  , m_openedDocument(false)
  // helpers
  , m_converter()
  , m_textParser(new TealDocTextParser(document))
{
}
555
556
// Nothing to release by hand; members clean up after themselves.
// NOTE(review): presumably defined out of line because TealDocTextParser is
// only complete in this file -- confirm against the header.
TealDocParser::~TealDocParser()
{
}
559
560
bool TealDocParser::checkType(const unsigned type, const unsigned creator)
561
19.9k
{
562
19.9k
  return TEALDOC_TYPE == type && TEALDOC_CREATOR == creator;
563
19.9k
}
564
565
// Intentionally empty: there is no appInfo in TealDoc, so the record (if
// any) is ignored.
void TealDocParser::readAppInfoRecord(librevenge::RVNGInputStream *)
{
}
569
570
// Intentionally empty: there is no sortInfo in TealDoc, so the record (if
// any) is ignored.
void TealDocParser::readSortInfoRecord(librevenge::RVNGInputStream *)
{
}
574
575
// Reads the index record: compression type, total text length, number of
// text records and record size, in that order.
//
// NOTE(review): the values come straight from the file but are validated
// with assert() only, i.e. not at all in builds with NDEBUG and by aborting
// otherwise. Consider reporting inconsistent files through the parser's
// normal error path instead.
void TealDocParser::readIndexRecord(librevenge::RVNGInputStream *const input)
{
  // 1 and 2 are the only accepted values; 2 selects compressed records
  const uint16_t compression = readU16(input, true);
  assert(1 == compression || 2 == compression);
  m_compressed = (2 == compression);

  // two bytes that are not used by this parser
  skip(input, 2);

  m_textLength = readU32(input, true);
  m_recordCount = readU16(input, true);
  m_recordSize = readU16(input, true);

  // check consistency
  assert(getDataRecordCount() == m_recordCount);
  assert(m_recordSize == TEALDOC_BLOCK_SIZE);
}
589
590
// Reads one text record and feeds it to the text parser.
//
// The record is read completely (through a PDBLZ77Stream wrapper if the
// index record announced compression). On the first record the charset
// converter is created from the record's content and the output document is
// opened. The bytes are then handed to the text parser through an
// EBOOKUTF8Stream. When last is true the document is closed afterwards.
void TealDocParser::readDataRecord(librevenge::RVNGInputStream *input, const bool last)
{
  vector<char> uncompressed;
  uncompressed.reserve(m_recordSize);

  // Owns the decompressing wrapper, if one is needed, for the duration of
  // this call; input is redirected to it.
  std::unique_ptr<librevenge::RVNGInputStream> compressedInput;
  if (m_compressed)
  {
    compressedInput.reset(new PDBLZ77Stream(input));
    input = compressedInput.get();
  }

  // slurp the whole record and keep track of the total amount of text read
  const long origPos = input->tell();
  while (!input->isEnd())
    uncompressed.push_back((char) readU8(input));
  m_read += unsigned(input->tell() - origPos);

  assert(m_read <= m_textLength);
  assert(!last || m_read == m_textLength);

  if (!m_openedDocument)
  {
    createConverter(uncompressed);
    openDocument();
  }

  // A record may be empty (createConverter() caters for that), and indexing
  // element 0 of an empty vector is undefined behaviour; data() is valid for
  // any size.
  EBOOKMemoryStream uncompressedStrm(reinterpret_cast<unsigned char *>(uncompressed.data()), (unsigned) uncompressed.size());
  EBOOKUTF8Stream utf8Strm(&uncompressedStrm);

  m_textParser->parse(&utf8Strm, last);

  if (last)
    closeDocument();
}
627
628
void TealDocParser::createConverter(const std::vector<char> &text)
629
1.30k
{
630
1.30k
  if (text.empty())
631
8
  {
632
8
    m_converter.reset(new EBOOKCharsetConverter("cp1252")); // try a default encoding
633
8
  }
634
1.29k
  else
635
1.29k
  {
636
1.29k
    std::unique_ptr<EBOOKCharsetConverter> converter(new EBOOKCharsetConverter());
637
1.29k
    if (converter->guessEncoding(&text[0], (unsigned) text.size()))
638
1.14k
      m_converter = std::move(converter);
639
153
    else
640
153
      throw GenericException();
641
1.29k
  }
642
1.30k
}
643
644
void TealDocParser::openDocument()
645
1.15k
{
646
1.15k
  if (m_openedDocument)
647
0
    return;
648
649
1.15k
  getDocument()->startDocument(librevenge::RVNGPropertyList());
650
651
1.15k
  librevenge::RVNGPropertyList metadata;
652
1.15k
  if (*getName())
653
889
  {
654
889
    vector<char> nameUtf8;
655
889
    if (m_converter->convertBytes(getName(), (unsigned int)std::strlen(getName()), nameUtf8) && !nameUtf8.empty())
656
888
    {
657
888
      nameUtf8.push_back(0);
658
888
      metadata.insert("dc:title", librevenge::RVNGString(&nameUtf8[0]));
659
888
    }
660
889
  }
661
662
1.15k
  getDocument()->setDocumentMetaData(metadata);
663
1.15k
  getDocument()->openPageSpan(getDefaultPageSpanPropList());
664
665
1.15k
  m_openedDocument = true;
666
1.15k
}
667
668
void TealDocParser::closeDocument()
669
693
{
670
693
  getDocument()->closePageSpan();
671
693
  getDocument()->endDocument();
672
693
  m_openedDocument = false;
673
693
}
674
675
}
676
677
/* vim:set shiftwidth=2 softtabstop=2 expandtab: */