/src/libwpd/src/lib/WP6Parser.cpp
Line | Count | Source |
1 | | /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: t; c-basic-offset: 4 -*- */ |
2 | | /* libwpd |
3 | | * Version: MPL 2.0 / LGPLv2.1+ |
4 | | * |
5 | | * This Source Code Form is subject to the terms of the Mozilla Public |
6 | | * License, v. 2.0. If a copy of the MPL was not distributed with this |
7 | | * file, You can obtain one at http://mozilla.org/MPL/2.0/. |
8 | | * |
9 | | * Major Contributor(s): |
10 | | * Copyright (C) 2002 William Lachance (wrlach@gmail.com) |
11 | | * Copyright (C) 2002 Marc Maurer (uwog@uwog.net) |
12 | | * |
13 | | * For minor contributions see the git repository. |
14 | | * |
15 | | * Alternatively, the contents of this file may be used under the terms |
16 | | * of the GNU Lesser General Public License Version 2.1 or later |
17 | | * (LGPLv2.1+), in which case the provisions of the LGPLv2.1+ are |
18 | | * applicable instead of those above. |
19 | | * |
20 | | * For further information visit http://libwpd.sourceforge.net |
21 | | */ |
22 | | |
23 | | /* "This product is not manufactured, approved, or supported by |
24 | | * Corel Corporation or Corel Corporation Limited." |
25 | | */ |
26 | | |
27 | | #include <memory> |
28 | | |
29 | | #include "WP6StylesListener.h" |
30 | | #include "WP6ContentListener.h" |
31 | | #include "WP6Parser.h" |
32 | | #include "WPXHeader.h" |
33 | | #include "WP6Header.h" |
34 | | #include "WP60Header.h" |
35 | | #include "WP61Header.h" |
36 | | #include "WP6PrefixData.h" |
37 | | #include "WP6Part.h" |
38 | | #include "libwpd_internal.h" |
39 | | #include "WP6DefaultInitialFontPacket.h" |
40 | | #include "WPXTable.h" |
41 | | |
42 | | WP6Parser::WP6Parser(librevenge::RVNGInputStream *input, WPXHeader *header, WPXEncryption *encryption) : |
43 | 16.5k | WPXParser(input, header, encryption) |
44 | 16.5k | { |
45 | 16.5k | } |
46 | | |
47 | | WP6Parser::~WP6Parser() |
48 | 16.5k | { |
49 | 16.5k | } |
50 | | |
51 | | WP6PrefixData *WP6Parser::getPrefixData(librevenge::RVNGInputStream *input, WPXEncryption *encryption) |
52 | 0 | { |
53 | 0 | WP6PrefixData *prefixData = nullptr; |
54 | 0 | try |
55 | 0 | { |
56 | 0 | prefixData = new WP6PrefixData(input, encryption, (static_cast<WP6Header *>(getHeader())->getNumPrefixIndices())); |
57 | 0 | return prefixData; |
58 | 0 | } |
59 | 0 | catch (FileException) |
60 | 0 | { |
61 | 0 | WPD_DEBUG_MSG(("WordPerfect: Prefix Data most likely corrupted.\n")); |
62 | | // TODO: Try to check packet after packet so that we try to recover at least the begining if the corruption is not at |
63 | | // the begining. |
64 | 0 | throw FileException(); |
65 | 0 | } |
66 | 0 | catch (...) |
67 | 0 | { |
68 | 0 | WPD_DEBUG_MSG(("WordPerfect: Prefix Data most likely corrupted. Trying to ignore.\n")); |
69 | | // TODO: Try to check packet after packet so that we try to recover at least the begining if the corruption is not at |
70 | | // the begining. |
71 | 0 | return nullptr; |
72 | 0 | } |
73 | 0 | } |
74 | | |
75 | | void WP6Parser::parse(librevenge::RVNGInputStream *input, WPXEncryption *encryption, WP6Listener *listener) |
76 | 0 | { |
77 | 0 | listener->startDocument(); |
78 | |
|
79 | 0 | input->seek(getHeader()->getDocumentOffset(), librevenge::RVNG_SEEK_SET); |
80 | |
|
81 | 0 | WPD_DEBUG_MSG(("WordPerfect: Starting document body parse (position = %ld)\n",(long)input->tell())); |
82 | |
|
83 | 0 | parseDocument(input, encryption, listener); |
84 | |
|
85 | 0 | listener->endDocument(); |
86 | 0 | } |
87 | | |
// Character codes emitted for the single-byte tokens 0x01..0x20 of a document
// body; the table is indexed with (token - 1). All 32 values lie in the
// Latin-1 range.
static const unsigned short extendedInternationalCharacterMap[] =
{
	0x00E5, // 0x01: small a with ring above
	0x00C5, // 0x02: capital A with ring above
	0x00E6, // 0x03: small ae ligature
	0x00C6, // 0x04: capital AE ligature
	0x00E4, // 0x05: small a with diaeresis
	0x00C4, // 0x06: capital A with diaeresis
	0x00E1, // 0x07: small a with acute
	0x00E0, // 0x08: small a with grave
	0x00E2, // 0x09: small a with circumflex
	0x00E3, // 0x0A: small a with tilde
	0x00C3, // 0x0B: capital A with tilde
	0x00E7, // 0x0C: small c with cedilla
	0x00C7, // 0x0D: capital C with cedilla
	0x00EB, // 0x0E: small e with diaeresis
	0x00E9, // 0x0F: small e with acute
	0x00C9, // 0x10: capital E with acute
	0x00E8, // 0x11: small e with grave
	0x00EA, // 0x12: small e with circumflex
	0x00ED, // 0x13: small i with acute
	0x00F1, // 0x14: small n with tilde
	0x00D1, // 0x15: capital N with tilde
	0x00F8, // 0x16: small o with stroke
	0x00D8, // 0x17: capital O with stroke
	0x00F5, // 0x18: small o with tilde
	0x00D5, // 0x19: capital O with tilde
	0x00F6, // 0x1A: small o with diaeresis
	0x00D6, // 0x1B: capital O with diaeresis
	0x00FC, // 0x1C: small u with diaeresis
	0x00DC, // 0x1D: capital U with diaeresis
	0x00FA, // 0x1E: small u with acute
	0x00F9, // 0x1F: small u with grave
	0x00DF  // 0x20: sharp s (double s)
};
123 | | |
124 | | // parseDocument: parses a document body (may call itself recursively, on other streams, or itself) |
125 | | void WP6Parser::parseDocument(librevenge::RVNGInputStream *input, WPXEncryption *encryption, WP6Listener *listener) |
126 | 26.5k | { |
127 | 36.8M | while (!input->isEnd()) |
128 | 36.8M | { |
129 | 36.8M | unsigned char readVal; |
130 | 36.8M | readVal = readU8(input, encryption); |
131 | | |
132 | 36.8M | if (readVal == (unsigned char)0x00) |
133 | 5.63M | { |
134 | | // do nothing: this token is meaningless and is likely just corruption |
135 | 5.63M | } |
136 | 31.1M | else if (readVal <= (unsigned char)0x20) |
137 | 13.9M | { |
138 | 13.9M | listener->insertCharacter(extendedInternationalCharacterMap[(readVal-1)]); |
139 | 13.9M | } |
140 | 17.2M | else if (readVal >= (unsigned char)0x21 && readVal <= (unsigned char)0x7F) |
141 | 5.95M | { |
142 | | // normal ASCII characters |
143 | 5.95M | listener->insertCharacter((unsigned)readVal); |
144 | 5.95M | } |
145 | 11.2M | else |
146 | 11.2M | { |
147 | 11.2M | std::unique_ptr<WP6Part> part(WP6Part::constructPart(input, encryption, readVal)); |
148 | 11.2M | if (part) |
149 | 7.57M | part->parse(listener); |
150 | 11.2M | } |
151 | 36.8M | } |
152 | 26.5k | } |
153 | | |
154 | | void WP6Parser::parsePacket(WP6PrefixData *prefixData, int type, WP6Listener *listener) |
155 | 0 | { |
156 | 0 | if (!prefixData) |
157 | 0 | return; |
158 | | |
159 | 0 | std::pair< MPDP_CIter, MPDP_CIter > typeIterPair = prefixData->getPrefixDataPacketsOfType(type); |
160 | 0 | if (typeIterPair.first != typeIterPair.second) |
161 | 0 | { |
162 | 0 | typeIterPair.first->second->parse(listener); |
163 | 0 | } |
164 | 0 | } |
165 | | |
166 | | void WP6Parser::parsePackets(WP6PrefixData *prefixData, int type, WP6Listener *listener) |
167 | 0 | { |
168 | 0 | if (!prefixData) |
169 | 0 | return; |
170 | | |
171 | 0 | std::pair< MPDP_CIter, MPDP_CIter > typeIterPair = prefixData->getPrefixDataPacketsOfType(type); |
172 | 0 | for (auto iter=typeIterPair.first; iter != typeIterPair.second; ++iter) |
173 | 0 | { |
174 | 0 | iter->second->parse(listener); |
175 | 0 | } |
176 | 0 | } |
177 | | |
178 | | // WP6Parser::parse() reads AND parses a wordperfect document, passing any retrieved low-level |
179 | | // information to a low-level listener |
180 | | void WP6Parser::parse(librevenge::RVNGTextInterface *documentInterface) |
181 | 0 | { |
182 | 0 | std::list<WPXPageSpan> pageList; |
183 | 0 | WPXTableList tableList; |
184 | |
|
185 | 0 | librevenge::RVNGInputStream *input = getInput(); |
186 | 0 | WPXEncryption *encryption = getEncryption(); |
187 | |
|
188 | 0 | std::unique_ptr<WP6PrefixData> prefixData(getPrefixData(input, encryption)); |
189 | | |
190 | | // do a "first-pass" parse of the document |
191 | | // gather table border information, page properties (per-page) |
192 | 0 | WP6StylesListener stylesListener(pageList, tableList); |
193 | 0 | stylesListener.setPrefixData(prefixData.get()); |
194 | 0 | parse(input, encryption, &stylesListener); |
195 | | |
196 | | // postprocess the pageList == remove duplicate page spans due to the page breaks |
197 | 0 | auto previousPage = pageList.begin(); |
198 | 0 | for (auto Iter=pageList.begin(); Iter != pageList.end(); /* Iter++ */) |
199 | 0 | { |
200 | 0 | if ((Iter != previousPage) && ((*previousPage)==(*Iter))) |
201 | 0 | { |
202 | 0 | (*previousPage).setPageSpan((*previousPage).getPageSpan() + (*Iter).getPageSpan()); |
203 | 0 | Iter = pageList.erase(Iter); |
204 | 0 | } |
205 | 0 | else |
206 | 0 | { |
207 | 0 | previousPage = Iter; |
208 | 0 | ++Iter; |
209 | 0 | } |
210 | 0 | } |
211 | | |
212 | | // second pass: here is where we actually send the messages to the target app |
213 | | // that are necessary to emit the body of the target document |
214 | 0 | WP6ContentListener listener(pageList, tableList, documentInterface); |
215 | 0 | listener.setPrefixData(prefixData.get()); |
216 | | |
217 | | // get the relevant initial prefix packets out of storage and tell them to parse |
218 | | // themselves |
219 | 0 | parsePacket(prefixData.get(), WP6_INDEX_HEADER_EXTENDED_DOCUMENT_SUMMARY, &listener); |
220 | 0 | parsePacket(prefixData.get(), WP6_INDEX_HEADER_INITIAL_FONT, &listener); |
221 | 0 | parsePackets(prefixData.get(), WP6_INDEX_HEADER_OUTLINE_STYLE, &listener); |
222 | |
|
223 | 0 | parse(input, encryption, &listener); |
224 | 0 | } |
225 | | |
226 | | void WP6Parser::parseSubDocument(librevenge::RVNGTextInterface *documentInterface) |
227 | 16.5k | { |
228 | 16.5k | std::list<WPXPageSpan> pageList; |
229 | 16.5k | WPXTableList tableList; |
230 | | |
231 | 16.5k | librevenge::RVNGInputStream *input = getInput(); |
232 | | |
233 | 16.5k | try |
234 | 16.5k | { |
235 | 16.5k | WP6StylesListener stylesListener(pageList, tableList); |
236 | 16.5k | stylesListener.startSubDocument(); |
237 | 16.5k | parseDocument(input, nullptr, &stylesListener); |
238 | 16.5k | stylesListener.endSubDocument(); |
239 | | |
240 | 16.5k | input->seek(0, librevenge::RVNG_SEEK_SET); |
241 | | |
242 | 16.5k | WP6ContentListener listener(pageList, tableList, documentInterface); |
243 | 16.5k | listener.startSubDocument(); |
244 | 16.5k | parseDocument(input, nullptr, &listener); |
245 | 16.5k | listener.endSubDocument(); |
246 | 16.5k | } |
247 | 16.5k | catch (FileException) |
248 | 16.5k | { |
249 | 6.51k | WPD_DEBUG_MSG(("WordPerfect: File Exception. Parse terminated prematurely.")); |
250 | 6.51k | throw FileException(); |
251 | 6.51k | } |
252 | 16.5k | } |
253 | | /* vim:set shiftwidth=4 softtabstop=4 noexpandtab: */ |