/src/libe-book/src/lib/TealDocParser.cpp
Line | Count | Source |
1 | | /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ |
2 | | /* |
3 | | * This file is part of the libe-book project. |
4 | | * |
5 | | * This Source Code Form is subject to the terms of the Mozilla Public |
6 | | * License, v. 2.0. If a copy of the MPL was not distributed with this |
7 | | * file, You can obtain one at http://mozilla.org/MPL/2.0/. |
8 | | */ |
9 | | |
10 | | #include <algorithm> |
11 | | #include <cassert> |
12 | | #include <cstring> |
13 | | #include <deque> |
14 | | #include <utility> |
15 | | |
16 | | #include <boost/fusion/include/std_pair.hpp> |
17 | | #include <boost/optional.hpp> |
18 | | #include <boost/spirit/include/qi.hpp> |
19 | | #include <boost/variant.hpp> |
20 | | |
21 | | #include <librevenge/librevenge.h> |
22 | | |
23 | | #include "libebook_utils.h" |
24 | | #include "EBOOKCharsetConverter.h" |
25 | | #include "EBOOKMemoryStream.h" |
26 | | #include "EBOOKUTF8Stream.h" |
27 | | #include "TealDocParser.h" |
28 | | #include "PDBLZ77Stream.h" |
29 | | |
30 | | using boost::optional; |
31 | | |
32 | | using std::deque; |
33 | | using std::string; |
34 | | using std::vector; |
35 | | |
36 | | namespace libebook |
37 | | { |
38 | | |
39 | | namespace |
40 | | { |
41 | | |
42 | | static const unsigned TEALDOC_BLOCK_SIZE = 4096; |
43 | | |
44 | | static const unsigned TEALDOC_TYPE = PDB_CODE("TEXt"); |
45 | | static const unsigned TEALDOC_CREATOR = PDB_CODE("TlDc"); |
46 | | |
47 | | } |
48 | | |
49 | | namespace |
50 | | { |
51 | | |
// Font selected by a tag's FONT attribute.
enum Font
{
  FONT_NORMAL = 0, // FONT=0
  FONT_BOLD,       // FONT=1; emitted as fo:font-weight="bold"
  FONT_LARGE       // FONT=2; not yet mapped to any span property
};
58 | | |
// Text decoration selected by a tag's STYLE attribute.
enum Style
{
  STYLE_NORMAL = 0, // no extra span properties
  STYLE_UNDERLINE,  // emitted as a single underline
  STYLE_INVERT      // emitted as white text on a black background
};
65 | | |
// Paragraph alignment selected by a tag's ALIGN attribute.
enum Align
{
  ALIGN_LEFT = 0, // fo:text-align="left"
  ALIGN_RIGHT,    // fo:text-align="end"
  ALIGN_CENTER    // fo:text-align="center"
};
72 | | |
// Identifiers produced by the tag grammar. Tag names, attribute names and
// enumerated attribute values share this one enumeration so that all of them
// can be carried as plain int in the parsed result.
enum Token
{
  // anything the grammar does not recognize by name
  TOKEN_UNKNOWN = 0,

  // tags
  TOKEN_BOOKMARK,
  TOKEN_HEADER,
  TOKEN_HRULE,
  TOKEN_LABEL,
  TOKEN_LINK,
  TOKEN_TEALPAINT,

  // attributes
  TOKEN_ALIGN,
  TOKEN_FONT,
  TOKEN_STYLE,
  TOKEN_TEXT,

  // values
  TOKEN_0,
  TOKEN_1,
  TOKEN_2,
  TOKEN_CENTER,
  TOKEN_INVERT,
  TOKEN_LEFT,
  TOKEN_NORMAL,
  TOKEN_RIGHT,
  TOKEN_UNDERLINE
};
99 | | |
100 | | struct TealDocAttributes |
101 | | { |
102 | | TealDocAttributes(); |
103 | | |
104 | | optional<Font> font; |
105 | | optional<Style> style; |
106 | | optional<Align> align; |
107 | | }; |
108 | | |
109 | | TealDocAttributes::TealDocAttributes() |
110 | 19.0k | : font() |
111 | 19.0k | , style() |
112 | 19.0k | , align() |
113 | 19.0k | { |
114 | 19.0k | } |
115 | | |
116 | | } |
117 | | |
118 | | namespace |
119 | | { |
120 | | |
121 | | namespace qi = boost::spirit::qi; |
122 | | |
123 | | typedef boost::variant<int, string> AttrValue_t; |
124 | | typedef deque<std::pair<int, AttrValue_t>> Attributes_t; |
125 | | |
126 | | template<typename Iterator> |
127 | | struct TagGrammar : public qi::grammar<Iterator, std::pair<int, Attributes_t>, qi::space_type> |
128 | | { |
129 | 20.9k | TagGrammar() : TagGrammar::base_type(tag, "tag") |
130 | 20.9k | { |
131 | 20.9k | using qi::alnum; |
132 | 20.9k | using qi::alpha; |
133 | 20.9k | using qi::attr; |
134 | 20.9k | using qi::char_; |
135 | 20.9k | using qi::lexeme; |
136 | 20.9k | using qi::no_case; |
137 | 20.9k | using qi::omit; |
138 | 20.9k | using qi::space; |
139 | | |
140 | 20.9k | tag %= '<' >> (no_case[tags] | unknown) >> *attrib >> '>'; |
141 | | |
142 | 20.9k | attrib %= |
143 | 20.9k | no_case[eattrs] >> '=' >> (values | text) // accept unknown values too |
144 | 20.9k | | no_case[sattrs] >> '=' >> text |
145 | 20.9k | | unknown >> '=' >> text |
146 | 20.9k | ; |
147 | | |
148 | 20.9k | value %= |
149 | 20.9k | '\'' >> values >> '\'' |
150 | 20.9k | | '\"' >> values >> '\"' |
151 | 20.9k | | values |
152 | 20.9k | ; |
153 | | |
154 | 20.9k | text %= |
155 | 20.9k | '\'' >> lexeme[+(char_ - '\'')] >> '\'' |
156 | 20.9k | | '\"' >> lexeme[+(char_ - '\"')] >> '\"' |
157 | 20.9k | | lexeme[+(char_ - space)] |
158 | 20.9k | ; |
159 | | |
160 | 20.9k | unknown %= omit[alpha >> *alnum] >> attr(TOKEN_UNKNOWN); |
161 | | |
162 | 20.9k | tag.name("tag"); |
163 | 20.9k | attrib.name("attrib"); |
164 | 20.9k | value.name("value"); |
165 | 20.9k | text.name("text"); |
166 | 20.9k | unknown.name("unknown"); |
167 | | |
168 | 20.9k | tags.name("tags"); |
169 | 20.9k | eattrs.name("eattrs"); |
170 | 20.9k | sattrs.name("sattrs"); |
171 | 20.9k | values.name("values"); |
172 | 20.9k | } |
173 | | |
174 | | struct tags_ : qi::symbols<char, int> |
175 | | { |
176 | | tags_() |
177 | 20.9k | { |
178 | 20.9k | add |
179 | 20.9k | ("bookmark", TOKEN_BOOKMARK) |
180 | 20.9k | ("header", TOKEN_HEADER) |
181 | 20.9k | ("hrule", TOKEN_HRULE) |
182 | 20.9k | ("label", TOKEN_LABEL) |
183 | 20.9k | ("link", TOKEN_LINK) |
184 | 20.9k | ("tealpaint", TOKEN_TEALPAINT) |
185 | 20.9k | ; |
186 | 20.9k | } |
187 | | } tags; |
188 | | |
189 | | struct eattrs_ : qi::symbols<char, int> |
190 | | { |
191 | | eattrs_() |
192 | 20.9k | { |
193 | 20.9k | add |
194 | 20.9k | ("align", TOKEN_ALIGN) |
195 | 20.9k | ("font", TOKEN_FONT) |
196 | 20.9k | ("style", TOKEN_STYLE) |
197 | 20.9k | ; |
198 | 20.9k | } |
199 | | } eattrs; |
200 | | |
201 | | struct sattrs_ : qi::symbols<char, int> |
202 | | { |
203 | | sattrs_() |
204 | 20.9k | { |
205 | 20.9k | add |
206 | 20.9k | ("text", TOKEN_TEXT) |
207 | 20.9k | ; |
208 | 20.9k | } |
209 | | } sattrs; |
210 | | |
211 | | struct values_ : qi::symbols<char, int> |
212 | | { |
213 | | values_() |
214 | 20.9k | { |
215 | 20.9k | add |
216 | 20.9k | ("0", TOKEN_0) |
217 | 20.9k | ("1", TOKEN_1) |
218 | 20.9k | ("2", TOKEN_2) |
219 | 20.9k | ("center", TOKEN_CENTER) |
220 | 20.9k | ("invert", TOKEN_INVERT) |
221 | 20.9k | ("left", TOKEN_LEFT) |
222 | 20.9k | ("normal", TOKEN_NORMAL) |
223 | 20.9k | ("right", TOKEN_RIGHT) |
224 | 20.9k | ("underline", TOKEN_UNDERLINE) |
225 | 20.9k | ; |
226 | 20.9k | } |
227 | | } values; |
228 | | |
229 | | qi::rule<Iterator, std::pair<int, Attributes_t>, qi::space_type> tag; |
230 | | qi::rule<Iterator, std::pair<int, AttrValue_t>, qi::space_type> attrib; |
231 | | qi::rule<Iterator, AttrValue_t, qi::space_type> value; |
232 | | qi::rule<Iterator, AttrValue_t, qi::space_type> text; |
233 | | qi::rule<Iterator, int, qi::space_type> unknown; |
234 | | }; |
235 | | |
236 | | } |
237 | | |
238 | | namespace |
239 | | { |
240 | | |
241 | | class AttributeHandler : public boost::static_visitor<> |
242 | | { |
243 | | public: |
244 | | AttributeHandler(const int tokenId, TealDocAttributes &attributes, string &text) |
245 | 0 | : m_attr(tokenId) |
246 | 0 | , m_attributes(attributes) |
247 | 0 | , m_text(text) |
248 | 0 | { |
249 | 0 | } |
250 | | |
251 | | void operator()(const int value) const |
252 | 0 | { |
253 | 0 | switch (m_attr) |
254 | 0 | { |
255 | 0 | case TOKEN_ALIGN : |
256 | 0 | { |
257 | 0 | switch (value) |
258 | 0 | { |
259 | 0 | case TOKEN_CENTER : |
260 | 0 | m_attributes.align = ALIGN_CENTER; |
261 | 0 | break; |
262 | 0 | case TOKEN_LEFT : |
263 | 0 | m_attributes.align = ALIGN_LEFT; |
264 | 0 | break; |
265 | 0 | case TOKEN_RIGHT : |
266 | 0 | m_attributes.align = ALIGN_RIGHT; |
267 | 0 | break; |
268 | 0 | default : |
269 | 0 | EBOOK_DEBUG_MSG(("unknown alignment %d\n", value)); |
270 | 0 | } |
271 | 0 | break; |
272 | 0 | } |
273 | 0 | case TOKEN_FONT : |
274 | 0 | { |
275 | 0 | switch (value) |
276 | 0 | { |
277 | 0 | case TOKEN_0 : |
278 | 0 | m_attributes.font = FONT_NORMAL; |
279 | 0 | break; |
280 | 0 | case TOKEN_1 : |
281 | 0 | m_attributes.font = FONT_BOLD; |
282 | 0 | break; |
283 | 0 | case TOKEN_2 : |
284 | 0 | m_attributes.font = FONT_LARGE; |
285 | 0 | break; |
286 | 0 | default : |
287 | 0 | EBOOK_DEBUG_MSG(("unknown font type %d\n", value)); |
288 | 0 | } |
289 | 0 | break; |
290 | 0 | } |
291 | 0 | case TOKEN_STYLE : |
292 | 0 | { |
293 | 0 | switch (value) |
294 | 0 | { |
295 | 0 | case TOKEN_INVERT : |
296 | 0 | m_attributes.style = STYLE_INVERT; |
297 | 0 | break; |
298 | 0 | case TOKEN_NORMAL : |
299 | 0 | m_attributes.style = STYLE_NORMAL; |
300 | 0 | break; |
301 | 0 | case TOKEN_UNDERLINE : |
302 | 0 | m_attributes.style = STYLE_UNDERLINE; |
303 | 0 | break; |
304 | 0 | default : |
305 | 0 | EBOOK_DEBUG_MSG(("unknown style %d\n", value)); |
306 | 0 | } |
307 | 0 | break; |
308 | 0 | } |
309 | 0 | default : |
310 | 0 | break; |
311 | 0 | } |
312 | 0 | } |
313 | | |
314 | | void operator()(const std::string &value) const |
315 | 0 | { |
316 | 0 | if (m_attr == TOKEN_TEXT) |
317 | 0 | m_text = value; |
318 | 0 | } |
319 | | |
320 | | private: |
321 | | const int m_attr; |
322 | | TealDocAttributes &m_attributes; |
323 | | string &m_text; |
324 | | }; |
325 | | |
326 | | } |
327 | | |
328 | | class TealDocTextParser |
329 | | { |
330 | | // -Weffc++ |
331 | | TealDocTextParser(const TealDocTextParser &other); |
332 | | TealDocTextParser operator=(const TealDocTextParser &other); |
333 | | |
334 | | public: |
335 | | explicit TealDocTextParser(librevenge::RVNGTextInterface *document); |
336 | | |
337 | | void parse(librevenge::RVNGInputStream *input, bool last = false); |
338 | | |
339 | | private: |
340 | | bool parseTag(librevenge::RVNGInputStream *input); |
341 | | |
342 | | bool parseHeaderTag(const Attributes_t &attributeList); |
343 | | |
344 | | void openParagraph(const TealDocAttributes &attributes = TealDocAttributes()); |
345 | | void closeParagraph(); |
346 | | |
347 | | void finishParagraph(); |
348 | | void flushText(const TealDocAttributes &attributes = TealDocAttributes()); |
349 | | |
350 | | private: |
351 | | librevenge::RVNGTextInterface *const m_document; |
352 | | |
353 | | string m_text; |
354 | | |
355 | | bool m_openedParagraph; |
356 | | }; |
357 | | |
358 | | TealDocTextParser::TealDocTextParser(librevenge::RVNGTextInterface *const document) |
359 | 2.68k | : m_document(document) |
360 | 2.68k | , m_text() |
361 | 2.68k | , m_openedParagraph(false) |
362 | 2.68k | { |
363 | 2.68k | } |
364 | | |
365 | | void TealDocTextParser::parse(librevenge::RVNGInputStream *const input, const bool last) |
366 | 3.40k | { |
367 | 986k | while (!input->isEnd()) |
368 | 983k | { |
369 | 983k | const unsigned char c = readU8(input); |
370 | | |
371 | 983k | switch (c) |
372 | 983k | { |
373 | 18.3k | case '\n' : |
374 | 18.3k | finishParagraph(); |
375 | 18.3k | break; |
376 | 21.2k | case '<' : |
377 | 21.2k | if (!parseTag(input)) |
378 | 20.1k | m_text.push_back('<'); |
379 | 21.2k | break; |
380 | 943k | default : |
381 | 943k | m_text.push_back((char) c); |
382 | 983k | } |
383 | 983k | } |
384 | | |
385 | 3.12k | if (last) |
386 | 693 | finishParagraph(); |
387 | 3.12k | } |
388 | | |
389 | | bool TealDocTextParser::parseTag(librevenge::RVNGInputStream *const input) |
390 | 21.2k | { |
391 | 21.2k | const auto pos = (unsigned long) input->tell(); |
392 | 21.2k | string tag("<"); |
393 | | |
394 | | // read tag into string |
395 | | // I suppose it would be possible to create an iterator adaptor for |
396 | | // librevenge::RVNGInputStream, but this is much simpler |
397 | 21.2k | unsigned char c = 0; |
398 | 21.2k | do |
399 | 12.6M | { |
400 | 12.6M | c = readU8(input); |
401 | 12.6M | tag.push_back((char) c); |
402 | 12.6M | } |
403 | 12.6M | while ('>' != c); |
404 | | |
405 | | // parse tag |
406 | 21.2k | std::pair<int, Attributes_t> parsedTag; |
407 | | |
408 | 21.2k | auto it = tag.cbegin(); |
409 | 21.2k | const bool match = qi::phrase_parse( |
410 | 21.2k | it, tag.cend(), |
411 | 21.2k | TagGrammar<string::const_iterator>(), qi::space, |
412 | 21.2k | parsedTag |
413 | 21.2k | ); |
414 | 21.2k | const bool success = match && it == tag.end(); |
415 | | |
416 | | // process tag |
417 | 21.2k | if (success) |
418 | 832 | { |
419 | 832 | if (parsedTag.first == TOKEN_HEADER) |
420 | 0 | { |
421 | 0 | finishParagraph(); |
422 | 0 | parseHeaderTag(parsedTag.second); |
423 | 0 | } |
424 | | /* TODO: handle TOKEN_TEALPAINT |
425 | | ok to ignore: TOKEN_BOOKMARK, TOKEN_HRULE, TOKEN_LABEL, TOKEN_LINK |
426 | | unknown 10 other enumerataions |
427 | | */ |
428 | 832 | } |
429 | 20.4k | else |
430 | 20.4k | input->seek((long) pos, librevenge::RVNG_SEEK_SET); |
431 | | |
432 | 21.2k | return success; |
433 | 21.2k | } |
434 | | |
435 | | bool TealDocTextParser::parseHeaderTag(const Attributes_t &attributeList) |
436 | 0 | { |
437 | 0 | TealDocAttributes attributes; |
438 | |
|
439 | 0 | for (const auto &it : attributeList) |
440 | 0 | { |
441 | 0 | if (it.first != TOKEN_UNKNOWN) |
442 | 0 | boost::apply_visitor(AttributeHandler(it.first, attributes, m_text), it.second); |
443 | 0 | } |
444 | |
|
445 | 0 | openParagraph(attributes); |
446 | 0 | flushText(attributes); |
447 | 0 | closeParagraph(); |
448 | |
|
449 | 0 | return true; |
450 | 0 | } |
451 | | |
452 | | void TealDocTextParser::openParagraph(const TealDocAttributes &attributes) |
453 | 19.0k | { |
454 | 19.0k | librevenge::RVNGPropertyList props; |
455 | | |
456 | 19.0k | if (attributes.align) |
457 | 0 | { |
458 | 0 | switch (get(attributes.align)) |
459 | 0 | { |
460 | 0 | case ALIGN_LEFT : |
461 | 0 | props.insert("fo:text-align", "left"); |
462 | 0 | break; |
463 | 0 | case ALIGN_RIGHT : |
464 | 0 | props.insert("fo:text-align", "end"); |
465 | 0 | break; |
466 | 0 | case ALIGN_CENTER : |
467 | 0 | props.insert("fo:text-align", "center"); |
468 | 0 | break; |
469 | 0 | default : |
470 | 0 | break; |
471 | 0 | } |
472 | 0 | } |
473 | | |
474 | 19.0k | m_document->openParagraph(props); |
475 | 19.0k | m_openedParagraph = true; |
476 | 19.0k | } |
477 | | |
478 | | void TealDocTextParser::closeParagraph() |
479 | 19.0k | { |
480 | 19.0k | if (m_openedParagraph) |
481 | 19.0k | m_document->closeParagraph(); |
482 | 19.0k | m_openedParagraph = false; |
483 | 19.0k | } |
484 | | |
485 | | void TealDocTextParser::finishParagraph() |
486 | 19.0k | { |
487 | 19.0k | flushText(); |
488 | 19.0k | closeParagraph(); |
489 | 19.0k | } |
490 | | |
491 | | void TealDocTextParser::flushText(const TealDocAttributes &attributes) |
492 | 19.0k | { |
493 | 19.0k | if (!m_openedParagraph) |
494 | 19.0k | openParagraph(attributes); |
495 | | |
496 | 19.0k | if (!m_text.empty()) |
497 | 10.8k | { |
498 | 10.8k | librevenge::RVNGPropertyList props; |
499 | | |
500 | 10.8k | if (attributes.font) |
501 | 0 | { |
502 | 0 | switch (get(attributes.font)) |
503 | 0 | { |
504 | 0 | case FONT_BOLD : |
505 | 0 | props.insert("fo:font-weight", "bold"); |
506 | 0 | break; |
507 | 0 | case FONT_LARGE : |
508 | | // TODO: handle |
509 | 0 | break; |
510 | 0 | case FONT_NORMAL : |
511 | | // fall through |
512 | 0 | default : |
513 | 0 | break; |
514 | 0 | } |
515 | 0 | } |
516 | | |
517 | 10.8k | if (attributes.style) |
518 | 0 | { |
519 | 0 | switch (get(attributes.style)) |
520 | 0 | { |
521 | 0 | case STYLE_UNDERLINE : |
522 | 0 | props.insert("style:text-underline-type", "single"); |
523 | 0 | break; |
524 | 0 | case STYLE_INVERT : |
525 | 0 | props.insert("fo:color", "#FFFFFF"); |
526 | 0 | props.insert("fo:background-color", "#000000"); |
527 | 0 | break; |
528 | 0 | case STYLE_NORMAL : |
529 | | // fall through |
530 | 0 | default : |
531 | 0 | break; |
532 | 0 | } |
533 | 0 | } |
534 | | |
535 | 10.8k | m_document->openSpan(props); |
536 | 10.8k | m_document->insertText(librevenge::RVNGString(m_text.c_str())); |
537 | 10.8k | m_document->closeSpan(); |
538 | | |
539 | 10.8k | m_text.clear(); |
540 | 10.8k | } |
541 | 19.0k | } |
542 | | |
543 | | TealDocParser::TealDocParser(librevenge::RVNGInputStream *input, librevenge::RVNGTextInterface *document) |
544 | 2.69k | : PDBParser(input, document, TEALDOC_TYPE, TEALDOC_CREATOR) |
545 | 2.69k | , m_compressed(false) |
546 | 2.69k | , m_textLength(0) |
547 | 2.69k | , m_recordCount(0) |
548 | 2.69k | , m_recordSize(0) |
549 | 2.69k | , m_read(0) |
550 | 2.69k | , m_openedDocument(false) |
551 | 2.69k | , m_converter() |
552 | 2.69k | , m_textParser(new TealDocTextParser(document)) |
553 | 2.69k | { |
554 | 2.69k | } |
555 | | |
556 | | TealDocParser::~TealDocParser() |
557 | 2.68k | { |
558 | 2.68k | } |
559 | | |
560 | | bool TealDocParser::checkType(const unsigned type, const unsigned creator) |
561 | 19.9k | { |
562 | 19.9k | return TEALDOC_TYPE == type && TEALDOC_CREATOR == creator; |
563 | 19.9k | } |
564 | | |
565 | | void TealDocParser::readAppInfoRecord(librevenge::RVNGInputStream *) |
566 | 0 | { |
567 | | // there is no appInfo in TealDoc |
568 | 0 | } |
569 | | |
570 | | void TealDocParser::readSortInfoRecord(librevenge::RVNGInputStream *) |
571 | 0 | { |
572 | | // there is no sortInfo in TealDoc |
573 | 0 | } |
574 | | |
575 | | void TealDocParser::readIndexRecord(librevenge::RVNGInputStream *const input) |
576 | 1.31k | { |
577 | 1.31k | const uint16_t compression = readU16(input, true); |
578 | 1.31k | assert(1 == compression || 2 == compression); |
579 | 1.31k | m_compressed = 2 == compression; |
580 | 1.31k | skip(input, 2); |
581 | 1.31k | m_textLength = readU32(input, true); |
582 | 1.31k | m_recordCount = readU16(input, true); |
583 | 1.31k | m_recordSize = readU16(input, true); |
584 | | |
585 | | // check consistency |
586 | 1.31k | assert(m_recordCount == getDataRecordCount()); |
587 | 1.31k | assert(TEALDOC_BLOCK_SIZE == m_recordSize); |
588 | 1.31k | } |
589 | | |
590 | | void TealDocParser::readDataRecord(librevenge::RVNGInputStream *input, const bool last) |
591 | 3.60k | { |
592 | 3.60k | vector<char> uncompressed; |
593 | 3.60k | uncompressed.reserve(m_recordSize); |
594 | | |
595 | 3.60k | std::unique_ptr<librevenge::RVNGInputStream> compressedInput; |
596 | | |
597 | | // This should not happen, but it is the easier case anyway :-) |
598 | 3.60k | if (m_compressed) |
599 | 444 | { |
600 | 444 | compressedInput.reset(new PDBLZ77Stream(input)); |
601 | 444 | input = compressedInput.get(); |
602 | 444 | } |
603 | | |
604 | 3.60k | const long origPos = input->tell(); |
605 | 1.87M | while (!input->isEnd()) |
606 | 1.86M | uncompressed.push_back((char) readU8(input)); |
607 | 3.60k | m_read += unsigned(input->tell() - origPos); |
608 | | |
609 | 3.60k | assert(m_read <= m_textLength); |
610 | 3.60k | if (last) |
611 | 3.60k | assert(m_read == m_textLength); |
612 | | |
613 | 3.60k | if (!m_openedDocument) |
614 | 1.30k | { |
615 | 1.30k | createConverter(uncompressed); |
616 | 1.30k | openDocument(); |
617 | 1.30k | } |
618 | | |
619 | 3.60k | EBOOKMemoryStream uncompressedStrm(reinterpret_cast<unsigned char *>(&uncompressed[0]), (unsigned) uncompressed.size()); |
620 | 3.60k | EBOOKUTF8Stream utf8Strm(&uncompressedStrm); |
621 | | |
622 | 3.60k | m_textParser->parse(&utf8Strm, last); |
623 | | |
624 | 3.60k | if (last) |
625 | 693 | closeDocument(); |
626 | 3.60k | } |
627 | | |
628 | | void TealDocParser::createConverter(const std::vector<char> &text) |
629 | 1.30k | { |
630 | 1.30k | if (text.empty()) |
631 | 8 | { |
632 | 8 | m_converter.reset(new EBOOKCharsetConverter("cp1252")); // try a default encoding |
633 | 8 | } |
634 | 1.29k | else |
635 | 1.29k | { |
636 | 1.29k | std::unique_ptr<EBOOKCharsetConverter> converter(new EBOOKCharsetConverter()); |
637 | 1.29k | if (converter->guessEncoding(&text[0], (unsigned) text.size())) |
638 | 1.14k | m_converter = std::move(converter); |
639 | 153 | else |
640 | 153 | throw GenericException(); |
641 | 1.29k | } |
642 | 1.30k | } |
643 | | |
644 | | void TealDocParser::openDocument() |
645 | 1.15k | { |
646 | 1.15k | if (m_openedDocument) |
647 | 0 | return; |
648 | | |
649 | 1.15k | getDocument()->startDocument(librevenge::RVNGPropertyList()); |
650 | | |
651 | 1.15k | librevenge::RVNGPropertyList metadata; |
652 | 1.15k | if (*getName()) |
653 | 889 | { |
654 | 889 | vector<char> nameUtf8; |
655 | 889 | if (m_converter->convertBytes(getName(), (unsigned int)std::strlen(getName()), nameUtf8) && !nameUtf8.empty()) |
656 | 888 | { |
657 | 888 | nameUtf8.push_back(0); |
658 | 888 | metadata.insert("dc:title", librevenge::RVNGString(&nameUtf8[0])); |
659 | 888 | } |
660 | 889 | } |
661 | | |
662 | 1.15k | getDocument()->setDocumentMetaData(metadata); |
663 | 1.15k | getDocument()->openPageSpan(getDefaultPageSpanPropList()); |
664 | | |
665 | 1.15k | m_openedDocument = true; |
666 | 1.15k | } |
667 | | |
668 | | void TealDocParser::closeDocument() |
669 | 693 | { |
670 | 693 | getDocument()->closePageSpan(); |
671 | 693 | getDocument()->endDocument(); |
672 | 693 | m_openedDocument = false; |
673 | 693 | } |
674 | | |
675 | | } |
676 | | |
677 | | /* vim:set shiftwidth=2 softtabstop=2 expandtab: */ |