Coverage Report

Created: 2025-11-09 06:09

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/exiv2/xmpsdk/src/XMPMeta-Parse.cpp
Line
Count
Source
1
// =================================================================================================
2
// Copyright 2002-2008 Adobe Systems Incorporated
3
// All Rights Reserved.
4
//
5
// NOTICE:  Adobe permits you to use, modify, and distribute this file in accordance with the terms
6
// of the Adobe license agreement accompanying it.
7
//
8
// Adobe patent application tracking #P435, entitled 'Unique markers to simplify embedding data of
9
// one format in a file with a different format', inventors: Sean Parent, Greg Gilley.
10
// =================================================================================================
11
12
#include "XMP_Environment.h"  // ! This must be the first include!
13
#include "XMPCore_Impl.hpp"
14
15
#include "XMPMeta.hpp"
16
#include "XMPUtils.hpp"
17
18
#include "UnicodeInlines.incl_cpp"
19
#include "UnicodeConversions.hpp"
20
#include "ExpatAdapter.hpp"
21
22
#if XMP_DebugBuild
23
  #include <iostream>
24
#endif
25
26
using namespace std;
27
28
#if XMP_WinBuild
29
#ifdef _MSC_VER
30
  #pragma warning ( disable : 4533 )  // initialization of '...' is skipped by 'goto ...'
31
  #pragma warning ( disable : 4702 )  // unreachable code
32
  #pragma warning ( disable : 4800 )  // forcing value to bool 'true' or 'false' (performance warning)
33
  #pragma warning ( disable : 4996 )  // '...' was declared deprecated
34
#endif
35
#endif
36
37
38
// *** Use the XMP_PropIsXyz (Schema, Simple, Struct, Array, ...) macros
39
// *** Add debug codegen checks, e.g. that typical masking operations really work
40
// *** Change all uses of strcmp and strncmp to XMP_LitMatch and XMP_LitNMatch
41
42
43
// =================================================================================================
44
// Local Types and Constants
45
// =========================
46
47
48
// =================================================================================================
49
// Static Variables
50
// ================
51
52
#ifndef Trace_ParsingHackery
53
  #define Trace_ParsingHackery 0
54
#endif
55
56
// UTF-8 replacement strings for the single bytes 0x80..0xFF, listed in byte order so that entry 0
// corresponds to byte 0x80 and entry 127 to byte 0xFF.
// NOTE(review): the lookup code is not in view here; presumably indexed by (byte - 0x80) - confirm.
static const char * kReplaceLatin1[128] =
  {

    // 0x80..0x9F: not defined by Latin-1, so the Windows code page 1252 meanings are used. Bytes
    // 0x81, 0x8D, 0x8F, 0x90, and 0x9D are formally undefined in 1252 too; the Windows conversion
    // API maps them to U+0081 etc., which are XML RestrictedChar values, so they become a space.

    "\xE2\x82\xAC", " ",            "\xE2\x80\x9A", "\xC6\x92",     // 0x80 .. 0x83
    "\xE2\x80\x9E", "\xE2\x80\xA6", "\xE2\x80\xA0", "\xE2\x80\xA1", // 0x84 .. 0x87
    "\xCB\x86",     "\xE2\x80\xB0", "\xC5\xA0",     "\xE2\x80\xB9", // 0x88 .. 0x8B
    "\xC5\x92",     " ",            "\xC5\xBD",     " ",            // 0x8C .. 0x8F

    " ",            "\xE2\x80\x98", "\xE2\x80\x99", "\xE2\x80\x9C", // 0x90 .. 0x93
    "\xE2\x80\x9D", "\xE2\x80\xA2", "\xE2\x80\x93", "\xE2\x80\x94", // 0x94 .. 0x97
    "\xCB\x9C",     "\xE2\x84\xA2", "\xC5\xA1",     "\xE2\x80\xBA", // 0x98 .. 0x9B
    "\xC5\x93",     " ",            "\xC5\xBE",     "\xC5\xB8",     // 0x9C .. 0x9F

    // 0xA0..0xFF: the official Latin-1 characters, i.e. U+00A0..U+00FF (the Unicode Latin
    // Supplement range), each written as its two byte UTF-8 form.

    "\xC2\xA0", "\xC2\xA1", "\xC2\xA2", "\xC2\xA3", "\xC2\xA4", "\xC2\xA5", "\xC2\xA6", "\xC2\xA7", // 0xA0 .. 0xA7
    "\xC2\xA8", "\xC2\xA9", "\xC2\xAA", "\xC2\xAB", "\xC2\xAC", "\xC2\xAD", "\xC2\xAE", "\xC2\xAF", // 0xA8 .. 0xAF

    "\xC2\xB0", "\xC2\xB1", "\xC2\xB2", "\xC2\xB3", "\xC2\xB4", "\xC2\xB5", "\xC2\xB6", "\xC2\xB7", // 0xB0 .. 0xB7
    "\xC2\xB8", "\xC2\xB9", "\xC2\xBA", "\xC2\xBB", "\xC2\xBC", "\xC2\xBD", "\xC2\xBE", "\xC2\xBF", // 0xB8 .. 0xBF

    "\xC3\x80", "\xC3\x81", "\xC3\x82", "\xC3\x83", "\xC3\x84", "\xC3\x85", "\xC3\x86", "\xC3\x87", // 0xC0 .. 0xC7
    "\xC3\x88", "\xC3\x89", "\xC3\x8A", "\xC3\x8B", "\xC3\x8C", "\xC3\x8D", "\xC3\x8E", "\xC3\x8F", // 0xC8 .. 0xCF

    "\xC3\x90", "\xC3\x91", "\xC3\x92", "\xC3\x93", "\xC3\x94", "\xC3\x95", "\xC3\x96", "\xC3\x97", // 0xD0 .. 0xD7
    "\xC3\x98", "\xC3\x99", "\xC3\x9A", "\xC3\x9B", "\xC3\x9C", "\xC3\x9D", "\xC3\x9E", "\xC3\x9F", // 0xD8 .. 0xDF

    "\xC3\xA0", "\xC3\xA1", "\xC3\xA2", "\xC3\xA3", "\xC3\xA4", "\xC3\xA5", "\xC3\xA6", "\xC3\xA7", // 0xE0 .. 0xE7
    "\xC3\xA8", "\xC3\xA9", "\xC3\xAA", "\xC3\xAB", "\xC3\xAC", "\xC3\xAD", "\xC3\xAE", "\xC3\xAF", // 0xE8 .. 0xEF

    "\xC3\xB0", "\xC3\xB1", "\xC3\xB2", "\xC3\xB3", "\xC3\xB4", "\xC3\xB5", "\xC3\xB6", "\xC3\xB7", // 0xF0 .. 0xF7
    "\xC3\xB8", "\xC3\xB9", "\xC3\xBA", "\xC3\xBB", "\xC3\xBC", "\xC3\xBD", "\xC3\xBE", "\xC3\xBF", // 0xF8 .. 0xFF

  };
96
97
98
// =================================================================================================
99
// Local Utilities
100
// ===============
101
102
103
// True only for '0'..'9' and upper case 'A'..'F'; lower case 'a'..'f' is not accepted.
// ! The argument is evaluated more than once, do not pass an expression with side effects.
#define IsHexDigit(ch) \
  ( ( ((ch) >= '0') && ((ch) <= '9') ) || ( ((ch) >= 'A') && ((ch) <= 'F') ) )

// Numeric value (0..15) of a character for which IsHexDigit is true. The result is not meaningful
// for any other character.
#define HexDigitValue(ch) \
  ( ( ((ch) - '0') >= 10 ) ? ( (ch) - 'A' + 10 ) : ( (ch) - '0' ) )
105
106
107
// -------------------------------------------------------------------------------------------------
108
// PickBestRoot
109
// ------------
110
// Choose the rdf:RDF element to use as the XMP root from the content below xmlParent. Returns 0 if
// nothing suitable is found. The search order at each level is:
//   1. An x:xmpmeta (or legacy x:xapmeta) child element; the search continues inside it with the
//      options cleared, so the kXMP_RequireXMPMeta restriction no longer applies there.
//   2. A direct rdf:RDF child element, unless kXMP_RequireXMPMeta is set in the options.
//   3. The first hit from recursing into each child in order, with the options unchanged.
static const XML_Node * PickBestRoot ( const XML_Node & xmlParent, XMP_OptionBits options )
{
  const size_t contentCount = xmlParent.content.size();

  // Step 1, the x:xmpmeta wrapper. Recursing here is broader than the strictly defined choice,
  // but it keeps the code small.
  for ( size_t i = 0; i < contentCount; ++i ) {
    const XML_Node * candidate = xmlParent.content[i];
    if ( candidate->kind == kElemNode ) {
      const bool isMetaWrapper = (candidate->name == "x:xmpmeta") || (candidate->name == "x:xapmeta");
      if ( isMetaWrapper ) return PickBestRoot ( *candidate, 0 );
    }
  }

  // Step 2, a bare rdf:RDF if the caller allows one.
  const bool bareRDFAllowed = ((options & kXMP_RequireXMPMeta) == 0);
  if ( bareRDFAllowed ) {
    for ( size_t i = 0; i < contentCount; ++i ) {
      const XML_Node * candidate = xmlParent.content[i];
      if ( (candidate->kind == kElemNode) && (candidate->name == "rdf:RDF") ) return candidate;
    }
  }

  // Step 3, look deeper. Every child is visited, whatever its kind.
  for ( size_t i = 0; i < contentCount; ++i ) {
    const XML_Node * nestedRoot = PickBestRoot ( *xmlParent.content[i], options );
    if ( nestedRoot != 0 ) return nestedRoot;
  }

  return 0;

}  // PickBestRoot
138
139
// -------------------------------------------------------------------------------------------------
140
// FindRootNode
141
// ------------
142
//
143
// Find the XML node that is the root of the XMP data tree. Generally this will be an outer node,
144
// but it could be anywhere if a general XML document is parsed (e.g. SVG). The XML parser counted
145
// all possible root nodes, and kept a pointer to the last one. If there is more than one possible
146
// root use PickBestRoot to choose among them.
147
//
148
// If there is a root node, try to extract the version of the previous XMP toolkit.
149
150
static const XML_Node * FindRootNode ( XMPMeta * thiz, const XMLParserAdapter & xmlParser, XMP_OptionBits options )
151
5.88k
{
152
5.88k
  const XML_Node * rootNode = xmlParser.rootNode;
153
  
154
5.88k
  if ( xmlParser.rootCount > 1 ) rootNode = PickBestRoot ( xmlParser.tree, options );
155
5.88k
  if ( rootNode == 0 ) return 0;
156
  
157
  // We have a root node. Try to extract previous toolkit version number.
158
  
159
5.44k
  XMP_StringPtr verStr = "";
160
  
161
5.44k
    XMP_Assert ( rootNode->name == "rdf:RDF" );
162
  
163
5.44k
    if ( (options & kXMP_RequireXMPMeta) &&
164
0
         ((rootNode->parent == 0) ||
165
0
          ((rootNode->parent->name != "x:xmpmeta") && (rootNode->parent->name != "x:xapmeta"))) ) return 0;
166
167
6.29k
    for ( size_t attrNum = 0, attrLim = rootNode->parent->attrs.size(); attrNum < attrLim; ++attrNum ) {
168
1.89k
      const XML_Node * currAttr =rootNode->parent->attrs[attrNum];
169
1.89k
      if ( (currAttr->name == "x:xmptk") || (currAttr->name == "x:xaptk") ) {
170
1.05k
        verStr = currAttr->value.c_str();
171
1.05k
        break;
172
1.05k
      }
173
1.89k
    }
174
    
175
  // Decode the version number into MMmmuubbb digits. If any part is too big, peg it at 99 or 999.
176
  
177
5.44k
  unsigned long part;
178
33.1k
  while ( (*verStr != 0) && ((*verStr < '0') || (*verStr > '9')) ) ++verStr;
179
  
180
5.44k
  part = 0;
181
8.69k
  while ( (*verStr != 0) && ('0' <= *verStr) && (*verStr <= '9') ) {
182
3.24k
    part = (part * 10) + (*verStr - '0');
183
3.24k
    ++verStr;
184
3.24k
  }
185
5.44k
  if ( part > 99 ) part = 99;
186
5.44k
  thiz->prevTkVer = part * 100*100*1000;
187
  
188
5.44k
  part = 0;
189
5.44k
  if ( *verStr == '.' ) ++verStr;
190
7.53k
  while ( (*verStr != 0) && ('0' <= *verStr) && (*verStr <= '9') ) {
191
2.08k
    part = (part * 10) + (*verStr - '0');
192
2.08k
    ++verStr;
193
2.08k
  }
194
5.44k
  if ( part > 99 ) part = 99;
195
5.44k
  thiz->prevTkVer += part * 100*1000;
196
  
197
5.44k
  part = 0;
198
5.44k
  if ( *verStr == '.' ) ++verStr;
199
7.73k
  while ( (*verStr != 0) && ('0' <= *verStr) && (*verStr <= '9') ) {
200
2.28k
    part = (part * 10) + (*verStr - '0');
201
2.28k
    ++verStr;
202
2.28k
  }
203
5.44k
  if ( part > 99 ) part = 99;
204
5.44k
  thiz->prevTkVer += part * 1000;
205
  
206
5.44k
  part = 0;
207
5.44k
  if ( *verStr == '-' ) ++verStr;
208
8.20k
  while ( (*verStr != 0) && ('0' <= *verStr) && (*verStr <= '9') ) {
209
2.75k
    part = (part * 10) + (*verStr - '0');
210
2.75k
    ++verStr;
211
2.75k
  }
212
5.44k
  if ( part > 999 ) part = 999;
213
5.44k
  thiz->prevTkVer += part;
214
  
215
5.44k
  return rootNode;
216
  
217
5.44k
}  // FindRootNode
218
219
// -------------------------------------------------------------------------------------------------
220
// NormalizeDCArrays
221
// -----------------
222
//
223
// Undo the denormalization performed by the XMP used in Acrobat 5. If a Dublin Core array had only
224
// one item, it was serialized as a simple property. The xml:lang attribute was dropped from an
225
// alt-text item if the language was x-default.
226
227
// *** This depends on the dc: namespace prefix.
228
229
static void
230
NormalizeDCArrays ( XMP_Node * xmpTree )
231
4.04k
{
232
4.04k
  XMP_Node * dcSchema = FindSchemaNode ( xmpTree, kXMP_NS_DC, kXMP_ExistingOnly );
233
4.04k
  if ( dcSchema == 0 ) return;
234
  
235
605
  for ( size_t propNum = 0, propLimit = dcSchema->children.size(); propNum < propLimit; ++propNum ) {
236
437
    XMP_Node *     currProp  = dcSchema->children[propNum];
237
437
    XMP_OptionBits arrayForm = 0;
238
    
239
437
    if ( ! XMP_PropIsSimple ( currProp->options ) ) continue;  // Nothing to do if not simple.
240
    
241
332
    if ( (currProp->name == "dc:creator" )     || // See if it is supposed to be an array.
242
308
         (currProp->name == "dc:date" ) ) {     // *** Think about an array of char* and a loop.
243
24
      arrayForm = kXMP_PropArrayIsOrdered;
244
308
    } else if (
245
308
         (currProp->name == "dc:description" ) ||
246
305
         (currProp->name == "dc:rights" )      ||
247
305
         (currProp->name == "dc:title" ) ) {
248
3
      arrayForm = kXMP_PropArrayIsAltText;
249
305
    } else if (
250
305
         (currProp->name == "dc:contributor" ) ||
251
303
         (currProp->name == "dc:language" )    ||
252
299
         (currProp->name == "dc:publisher" )   ||
253
286
         (currProp->name == "dc:relation" )    ||
254
286
         (currProp->name == "dc:subject" )     ||
255
283
         (currProp->name == "dc:type" ) ) {
256
22
      arrayForm = kXMP_PropValueIsArray;
257
22
    }
258
332
    if ( arrayForm == 0 ) continue;  // Nothing to do if it isn't supposed to be an array.
259
    
260
49
    arrayForm = VerifySetOptions ( arrayForm, 0 );  // Set the implicit array bits.
261
49
    XMP_Node * newArray = new XMP_Node ( dcSchema, currProp->name.c_str(), arrayForm );
262
49
    dcSchema->children[propNum] = newArray;
263
49
    newArray->children.push_back ( currProp );
264
49
    currProp->parent = newArray;
265
49
    currProp->name = kXMP_ArrayItemName;
266
    
267
49
    if ( XMP_ArrayIsAltText ( arrayForm ) && (! (currProp->options & kXMP_PropHasLang)) ) {
268
3
      XMP_Node * newLang = new XMP_Node ( currProp, "xml:lang", "x-default", kXMP_PropIsQualifier );
269
3
      currProp->options |= (kXMP_PropHasQualifiers | kXMP_PropHasLang);
270
3
      if ( currProp->qualifiers.empty() ) { // *** Need a util?
271
3
        currProp->qualifiers.push_back ( newLang );
272
3
      } else {
273
0
        currProp->qualifiers.insert ( currProp->qualifiers.begin(), newLang );
274
0
      }
275
3
    }
276
277
49
  }
278
  
279
168
}  // NormalizeDCArrays
280
281
282
// -------------------------------------------------------------------------------------------------
283
// CompareAliasedSubtrees
284
// ----------------------
285
286
// *** Change to do some alias-specific setup, then use CompareSubtrees. One special case for
287
// *** aliases is a simple to x-default alias, the options and qualifiers obviously differ.
288
289
static void
290
CompareAliasedSubtrees ( XMP_Node * aliasNode, XMP_Node * baseNode, bool outerCall = true )
291
0
{
292
  // ! The outermost call is special. The names almost certainly differ. The qualifiers (and
293
  // ! hence options) will differ for an alias to the x-default item of a langAlt array.
294
0
  if ( (aliasNode->value != baseNode->value) ||
295
0
       (aliasNode->children.size() != baseNode->children.size()) ) {
296
0
    XMP_Throw ( "Mismatch between alias and base nodes", kXMPErr_BadXMP );
297
0
  }
298
0
  if ( ! outerCall ) {
299
0
    if ( (aliasNode->name != baseNode->name) ||
300
0
         (aliasNode->options != baseNode->options) ||
301
0
         (aliasNode->qualifiers.size() != baseNode->qualifiers.size()) ) {
302
0
      XMP_Throw ( "Mismatch between alias and base nodes", kXMPErr_BadXMP );
303
0
    }
304
0
  }
305
  
306
0
  for ( size_t childNum = 0, childLim = aliasNode->children.size(); childNum < childLim; ++childNum ) {
307
0
    XMP_Node * aliasChild = aliasNode->children[childNum];
308
0
    XMP_Node * baseChild  = baseNode->children[childNum];
309
0
    CompareAliasedSubtrees ( aliasChild, baseChild, false );
310
0
  }
311
  
312
0
  for ( size_t qualNum = 0, qualLim = aliasNode->qualifiers.size(); qualNum < qualLim; ++qualNum ) {
313
0
    XMP_Node * aliasQual = aliasNode->qualifiers[qualNum];
314
0
    XMP_Node * baseQual  = baseNode->qualifiers[qualNum];
315
0
    CompareAliasedSubtrees ( aliasQual, baseQual, false );
316
0
  }
317
  
318
0
}  // CompareAliasedSubtrees
319
320
321
// -------------------------------------------------------------------------------------------------
322
// TransplantArrayItemAlias
323
// ------------------------
324
325
static void
326
TransplantArrayItemAlias ( XMP_Node * oldParent, size_t oldNum, XMP_Node * newParent )
327
0
{
328
0
  XMP_Node * childNode = oldParent->children[oldNum];
329
330
0
  if ( newParent->options & kXMP_PropArrayIsAltText ) {
331
0
    if ( childNode->options & kXMP_PropHasLang ) {
332
0
      XMP_Throw ( "Alias to x-default already has a language qualifier", kXMPErr_BadXMP ); // *** Allow x-default.
333
0
    }
334
0
    childNode->options |= (kXMP_PropHasQualifiers | kXMP_PropHasLang);
335
0
    XMP_Node * langQual = new XMP_Node ( childNode, "xml:lang", "x-default", kXMP_PropIsQualifier );  // *** AddLangQual util?
336
0
    if ( childNode->qualifiers.empty() ) {
337
0
      childNode->qualifiers.push_back ( langQual );
338
0
    } else {
339
0
      childNode->qualifiers.insert ( childNode->qualifiers.begin(), langQual );
340
0
    }
341
0
  }
342
343
0
  oldParent->children.erase ( oldParent->children.begin() + oldNum );
344
0
  childNode->name = kXMP_ArrayItemName;
345
0
  childNode->parent = newParent;
346
0
  if ( newParent->children.empty() ) {
347
0
    newParent->children.push_back ( childNode );
348
0
  } else {
349
0
    newParent->children.insert ( newParent->children.begin(), childNode );
350
0
  }
351
352
0
}  // TransplantArrayItemAlias
353
354
355
// -------------------------------------------------------------------------------------------------
356
// TransplantNamedAlias
357
// --------------------
358
359
static void
360
TransplantNamedAlias ( XMP_Node * oldParent, size_t oldNum, XMP_Node * newParent, XMP_VarString & newName )
361
0
{
362
0
  XMP_Node * childNode = oldParent->children[oldNum];
363
364
0
  oldParent->children.erase ( oldParent->children.begin() + oldNum );
365
0
  childNode->name = newName;
366
0
  childNode->parent = newParent;
367
0
  newParent->children.push_back ( childNode );
368
369
0
}  // TransplantNamedAlias
370
371
372
// -------------------------------------------------------------------------------------------------
373
// MoveExplicitAliases
374
// -------------------
375
376
static void
377
MoveExplicitAliases ( XMP_Node * tree, XMP_OptionBits parseOptions )
378
0
{
379
0
  tree->options ^= kXMP_PropHasAliases;
380
0
  const bool strictAliasing = ((parseOptions & kXMP_StrictAliasing) != 0);
381
  
382
  // Visit all of the top level nodes looking for aliases. If there is no base, transplant the
383
  // alias subtree. If there is a base and strict aliasing is on, make sure the alias and base 
384
  // subtrees match.
385
  
386
  // ! Use "while" loops not "for" loops since both the schema and property loops can remove the
387
  // ! current item from the vector being traversed. And don't increment the counter for a delete.
388
  
389
0
  size_t schemaNum = 0;
390
0
  while ( schemaNum < tree->children.size() ) {
391
0
    XMP_Node * currSchema = tree->children[schemaNum];
392
    
393
0
    size_t propNum = 0;
394
0
    while ( propNum < currSchema->children.size() ) {
395
0
      XMP_Node * currProp = currSchema->children[propNum];
396
0
      if ( ! (currProp->options & kXMP_PropIsAlias) ) {
397
0
        ++propNum;
398
0
        continue;
399
0
      }
400
0
      currProp->options ^= kXMP_PropIsAlias;
401
402
      // Find the base path, look for the base schema and root node.
403
404
0
      XMP_AliasMapPos aliasPos = sRegisteredAliasMap->find ( currProp->name );
405
0
      XMP_Assert ( aliasPos != sRegisteredAliasMap->end() );
406
0
      XMP_ExpandedXPath & basePath = aliasPos->second;
407
0
      XMP_OptionBits arrayOptions = (basePath[kRootPropStep].options & kXMP_PropArrayFormMask);
408
409
0
      XMP_Node * baseSchema = FindSchemaNode ( tree, basePath[kSchemaStep].step.c_str(), kXMP_CreateNodes );
410
0
      if ( baseSchema->options & kXMP_NewImplicitNode ) baseSchema->options ^= kXMP_NewImplicitNode;
411
0
      XMP_Node * baseNode = FindChildNode ( baseSchema, basePath[kRootPropStep].step.c_str(), kXMP_ExistingOnly );
412
413
0
      if ( baseNode == 0 ) {
414
      
415
0
        if ( basePath.size() == 2 ) {
416
          // A top-to-top alias, transplant the property.
417
0
          TransplantNamedAlias ( currSchema, propNum, baseSchema, basePath[kRootPropStep].step );
418
0
        } else {
419
          // An alias to an array item, create the array and transplant the property.
420
0
          baseNode = new XMP_Node ( baseSchema, basePath[kRootPropStep].step.c_str(), arrayOptions );
421
0
          baseSchema->children.push_back ( baseNode );
422
0
          TransplantArrayItemAlias ( currSchema, propNum, baseNode );
423
0
        }
424
      
425
0
      } else if ( basePath.size() == 2 ) {
426
      
427
        // The base node does exist and this is a top-to-top alias. Check for conflicts if
428
        // strict aliasing is on. Remove and delete the alias subtree.
429
0
        if ( strictAliasing ) CompareAliasedSubtrees ( currProp, baseNode );
430
0
        currSchema->children.erase ( currSchema->children.begin() + propNum );
431
0
        delete currProp;
432
      
433
0
      } else {
434
      
435
        // This is an alias to an array item and the array exists. Look for the aliased item.
436
        // Then transplant or check & delete as appropriate.
437
        
438
0
        XMP_Node * itemNode = 0;
439
0
        if ( arrayOptions & kXMP_PropArrayIsAltText ) {
440
0
          XMP_Index xdIndex = LookupLangItem ( baseNode, *xdefaultName );
441
0
          if ( xdIndex != -1 ) itemNode = baseNode->children[xdIndex];
442
0
        } else if ( ! baseNode->children.empty() ) {
443
0
          itemNode = baseNode->children[0];
444
0
        }
445
        
446
0
        if ( itemNode == 0 ) {
447
0
          TransplantArrayItemAlias ( currSchema, propNum, baseNode );
448
0
        } else {
449
0
          if ( strictAliasing ) CompareAliasedSubtrees ( currProp, itemNode );
450
0
          currSchema->children.erase ( currSchema->children.begin() + propNum );
451
0
          delete currProp;
452
0
        }
453
454
0
      }
455
456
0
    } // Property loop
457
    
458
    // Increment the counter or remove an empty schema node.
459
0
    if ( currSchema->children.size() > 0 ) {
460
0
      ++schemaNum;
461
0
    } else {
462
0
      delete tree->children[schemaNum]; // ! Delete the schema node itself.
463
0
      tree->children.erase ( tree->children.begin() + schemaNum );
464
0
    }
465
    
466
0
  } // Schema loop
467
  
468
0
}  // MoveExplicitAliases
469
470
471
// -------------------------------------------------------------------------------------------------
472
// FixGPSTimeStamp
473
// ---------------
474
475
static void
476
FixGPSTimeStamp ( XMP_Node * exifSchema, XMP_Node * gpsDateTime )
477
0
{
478
0
  XMP_DateTime binGPSStamp;
479
0
  try {
480
0
    XMPUtils::ConvertToDate ( gpsDateTime->value.c_str(), &binGPSStamp );
481
0
  } catch ( ... ) {
482
0
    return; // Don't let a bad date stop other things.
483
0
  }
484
0
  if ( (binGPSStamp.year != 0) || (binGPSStamp.month != 0) || (binGPSStamp.day != 0) ) return;
485
  
486
0
  XMP_Node * otherDate = FindChildNode ( exifSchema, "exif:DateTimeOriginal", kXMP_ExistingOnly );
487
0
  if ( otherDate == 0 ) otherDate = FindChildNode ( exifSchema, "exif:DateTimeDigitized", kXMP_ExistingOnly );
488
0
  if ( otherDate == 0 ) return;
489
490
0
  XMP_DateTime binOtherDate;
491
0
  try {
492
0
    XMPUtils::ConvertToDate ( otherDate->value.c_str(), &binOtherDate );
493
0
  } catch ( ... ) {
494
0
    return; // Don't let a bad date stop other things.
495
0
  }
496
  
497
0
  binGPSStamp.year  = binOtherDate.year;
498
0
  binGPSStamp.month = binOtherDate.month;
499
0
  binGPSStamp.day   = binOtherDate.day;
500
501
0
  XMP_StringPtr goodStr;
502
0
  XMP_StringLen goodLen;
503
0
  XMPUtils::ConvertFromDate ( binGPSStamp, &goodStr, &goodLen );
504
  
505
0
  gpsDateTime->value.assign ( goodStr, goodLen );
506
507
0
}  // FixGPSTimeStamp
508
509
510
// -------------------------------------------------------------------------------------------------
511
// MigrateAudioCopyright
512
// ---------------------
513
//
514
// The initial support for WAV files mapped a legacy ID3 audio copyright into a new xmpDM:copyright
515
// property. This is special case code to migrate that into dc:rights['x-default']. The rules:
516
//
517
//   1. If there is no dc:rights array, or an empty array -
518
//      Create one with dc:rights['x-default'] set from double linefeed and xmpDM:copyright.
519
//
520
//   2. If there is a dc:rights array but it has no x-default item -
521
//      Create an x-default item as a copy of the first item then apply rule #3.
522
//
523
//   3. If there is a dc:rights array with an x-default item, look for a double linefeed in the value.
524
//      A. If no double linefeed, compare the x-default value to the xmpDM:copyright value.
525
//         A1. If they match then leave the x-default value alone.
526
//         A2. Otherwise, append a double linefeed and the xmpDM:copyright value to the x-default value.
527
//      B. If there is a double linefeed, compare the trailing text to the xmpDM:copyright value.
528
//         B1. If they match then leave the x-default value alone.
529
//         B2. Otherwise, replace the trailing x-default text with the xmpDM:copyright value.
530
//
531
//   4. In all cases, delete the xmpDM:copyright property.
532
533
static void
534
MigrateAudioCopyright ( XMPMeta * xmp, XMP_Node * dmCopyright )
535
0
{
536
537
0
  try {
538
  
539
0
    std::string & dmValue = dmCopyright->value;
540
0
    static const char * kDoubleLF = "\xA\xA";
541
    
542
0
    XMP_Node * dcSchema = FindSchemaNode ( &xmp->tree, kXMP_NS_DC, kXMP_CreateNodes );
543
0
    XMP_Node * dcRightsArray = FindChildNode ( dcSchema, "dc:rights", kXMP_ExistingOnly );
544
    
545
0
    if ( (dcRightsArray == 0) || dcRightsArray->children.empty() ) {
546
    
547
      // 1. No dc:rights array, create from double linefeed and xmpDM:copyright.
548
0
      dmValue.insert ( 0, kDoubleLF );
549
0
      xmp->SetLocalizedText ( kXMP_NS_DC, "rights", "", "x-default",  dmValue.c_str(), 0 );
550
    
551
0
    } else {
552
553
0
      std::string xdefaultStr ( "x-default" );
554
      
555
0
      XMP_Index xdIndex = LookupLangItem ( dcRightsArray, xdefaultStr );
556
      
557
0
      if ( xdIndex < 0 ) {
558
        // 2. No x-default item, create from the first item.
559
0
        XMP_StringPtr firstValue = dcRightsArray->children[0]->value.c_str();
560
0
        xmp->SetLocalizedText ( kXMP_NS_DC, "rights", "", "x-default",  firstValue, 0 );
561
0
        xdIndex = LookupLangItem ( dcRightsArray, xdefaultStr );
562
0
      }
563
            
564
      // 3. Look for a double linefeed in the x-default value.
565
0
      XMP_Assert ( xdIndex == 0 );
566
0
      std::string & defaultValue = dcRightsArray->children[xdIndex]->value;
567
0
      XMP_Index lfPos = defaultValue.find ( kDoubleLF );
568
      
569
0
      if ( lfPos < 0 ) {
570
      
571
        // 3A. No double LF, compare whole values.
572
0
        if ( dmValue != defaultValue ) {
573
          // 3A2. Append the xmpDM:copyright to the x-default item.
574
0
          defaultValue += kDoubleLF;
575
0
          defaultValue += dmValue;
576
0
        }
577
      
578
0
      } else {
579
      
580
        // 3B. Has double LF, compare the tail.
581
0
        if ( defaultValue.compare ( lfPos+2, std::string::npos, dmValue ) != 0 ) {
582
          // 3B2. Replace the x-default tail.
583
0
          defaultValue.replace ( lfPos+2, std::string::npos, dmValue );
584
0
        }
585
      
586
0
      }
587
588
0
    }
589
    
590
    // 4. Get rid of the xmpDM:copyright.
591
0
    xmp->DeleteProperty ( kXMP_NS_DM, "copyright" );
592
  
593
0
  } catch ( ... ) {
594
    // Don't let failures (like a bad dc:rights form) stop other cleanup.
595
0
  }
596
597
0
}  // MigrateAudioCopyright
598
599
600
// -------------------------------------------------------------------------------------------------
601
// RepairAltText
602
// -------------
603
//
604
// Make sure that the array is well-formed AltText. Each item must be simple and have an xml:lang
605
// qualifier. If repairs are needed, keep simple non-empty items by adding the xml:lang.
606
607
static void
608
RepairAltText ( XMP_Node & tree, XMP_StringPtr schemaNS, XMP_StringPtr arrayName )
609
20.2k
{
610
20.2k
  XMP_Node * schemaNode = FindSchemaNode ( &tree, schemaNS, kXMP_ExistingOnly );
611
20.2k
  if ( schemaNode == 0 ) return;
612
  
613
727
  XMP_Node * arrayNode = FindChildNode ( schemaNode, arrayName, kXMP_ExistingOnly );
614
727
  if ( (arrayNode == 0) || XMP_ArrayIsAltText ( arrayNode->options ) ) return;  // Already OK.
615
  
616
90
  if ( ! XMP_PropIsArray ( arrayNode->options ) ) return;  // ! Not even an array, leave it alone.
617
  // *** Should probably change simple values to LangAlt with 'x-default' item.
618
  
619
85
  arrayNode->options |= (kXMP_PropArrayIsOrdered | kXMP_PropArrayIsAlternate | kXMP_PropArrayIsAltText);
620
  
621
464
  for ( int i = arrayNode->children.size()-1; i >= 0; --i ) { // ! Need a signed index type.
622
623
379
    XMP_Node * currChild = arrayNode->children[i];
624
625
379
    if ( ! XMP_PropIsSimple ( currChild->options ) ) {
626
627
      // Delete non-simple children.
628
13
      delete ( currChild );
629
13
      arrayNode->children.erase ( arrayNode->children.begin() + i );
630
631
366
    } else if ( ! XMP_PropHasLang ( currChild->options ) ) {
632
    
633
280
      if ( currChild->value.empty() ) {
634
635
        // Delete empty valued children that have no xml:lang.
636
11
        delete ( currChild );
637
11
        arrayNode->children.erase ( arrayNode->children.begin() + i );
638
639
269
      } else {
640
641
        // Add an xml:lang qualifier with the value "x-repair".
642
269
        XMP_Node * repairLang = new XMP_Node ( currChild, "xml:lang", "x-repair", kXMP_PropIsQualifier );
643
269
        if ( currChild->qualifiers.empty() ) {
644
269
          currChild->qualifiers.push_back ( repairLang );
645
269
        } else {
646
0
          currChild->qualifiers.insert ( currChild->qualifiers.begin(), repairLang );
647
0
        }
648
269
        currChild->options |= (kXMP_PropHasQualifiers | kXMP_PropHasLang);
649
650
269
      }
651
652
280
    }
653
654
379
  }
655
656
85
}  // RepairAltText
657
658
659
// -------------------------------------------------------------------------------------------------
660
// TouchUpDataModel
661
// ----------------
662
663
static void
664
TouchUpDataModel ( XMPMeta * xmp )
665
4.04k
{
666
4.04k
  XMP_Node & tree = xmp->tree;
667
  
668
  // Do special case touch ups for certain schema.
669
670
4.04k
  XMP_Node * currSchema = 0;
671
672
4.04k
  currSchema = FindSchemaNode ( &tree, kXMP_NS_EXIF, kXMP_ExistingOnly );
673
4.04k
  if ( currSchema != 0 ) {
674
675
    // Do a special case fix for exif:GPSTimeStamp.
676
25
    XMP_Node * gpsDateTime = FindChildNode ( currSchema, "exif:GPSTimeStamp", kXMP_ExistingOnly );
677
25
    if ( gpsDateTime != 0 ) FixGPSTimeStamp ( currSchema, gpsDateTime );
678
  
679
    // *** Should probably have RepairAltText change simple values to LangAlt with 'x-default' item.
680
    // *** For now just do this for exif:UserComment, the one case we know about, late in cycle fix.
681
25
    XMP_Node * userComment = FindChildNode ( currSchema, "exif:UserComment", kXMP_ExistingOnly );
682
25
    if ( (userComment != 0) && XMP_PropIsSimple ( userComment->options ) ) {
683
0
      XMP_Node * newChild = new XMP_Node ( userComment, kXMP_ArrayItemName,
684
0
                         userComment->value.c_str(), userComment->options );
685
0
      newChild->qualifiers.swap ( userComment->qualifiers );
686
0
      if ( ! XMP_PropHasLang ( newChild->options ) ) {
687
0
        XMP_Node * langQual = new XMP_Node ( newChild, "xml:lang", "x-default", kXMP_PropIsQualifier );
688
0
        newChild->qualifiers.insert ( newChild->qualifiers.begin(), langQual );
689
0
        newChild->options |= (kXMP_PropHasQualifiers | kXMP_PropHasLang);
690
0
      }
691
0
      userComment->value.erase();
692
0
      userComment->options = kXMP_PropArrayFormMask;  // ! Happens to have all the right bits.
693
0
      userComment->children.push_back ( newChild );
694
0
    }
695
696
25
  }
697
698
4.04k
  currSchema = FindSchemaNode ( &tree, kXMP_NS_DM, kXMP_ExistingOnly );
699
4.04k
  if ( currSchema != 0 ) {
700
    // Do a special case migration of xmpDM:copyright to dc:rights['x-default']. Do this before
701
    // the dc: touch up since it can affect the dc: schema.
702
0
    XMP_Node * dmCopyright = FindChildNode ( currSchema, "xmpDM:copyright", kXMP_ExistingOnly );
703
0
    if ( dmCopyright != 0 ) MigrateAudioCopyright ( xmp, dmCopyright );
704
0
  }
705
706
4.04k
  currSchema = FindSchemaNode ( &tree, kXMP_NS_DC, kXMP_ExistingOnly );
707
4.04k
  if ( currSchema != 0 ) {
708
    // Do a special case fix for dc:subject, make sure it is an unordered array.
709
168
    XMP_Node * dcSubject = FindChildNode ( currSchema, "dc:subject", kXMP_ExistingOnly );
710
168
    if ( dcSubject != 0 ) {
711
24
                        XMP_OptionBits keepMask = static_cast<XMP_OptionBits>(~(kXMP_PropArrayIsOrdered | kXMP_PropArrayIsAlternate | kXMP_PropArrayIsAltText));
712
24
      dcSubject->options &= keepMask; // Make sure any ordered array bits are clear.
713
24
    }
714
168
  }
715
  
716
  // Fix any broken AltText arrays that we know about.
717
  
718
4.04k
  RepairAltText ( tree, kXMP_NS_DC, "dc:description" ); // ! Note inclusion of prefixes for direct node lookup!
719
4.04k
  RepairAltText ( tree, kXMP_NS_DC, "dc:rights" );
720
4.04k
  RepairAltText ( tree, kXMP_NS_DC, "dc:title" );
721
4.04k
  RepairAltText ( tree, kXMP_NS_XMP_Rights, "xmpRights:UsageTerms" );
722
4.04k
  RepairAltText ( tree, kXMP_NS_EXIF, "exif:UserComment" );
723
  
724
  // Tweak old XMP: Move an instance ID from rdf:about to the xmpMM:InstanceID property. An old
725
  // instance ID usually looks like "uuid:bac965c4-9d87-11d9-9a30-000d936b79c4", plus InDesign
726
  // 3.0 wrote them like "bac965c4-9d87-11d9-9a30-000d936b79c4". If the name looks like a UUID
727
  // simply move it to xmpMM:InstanceID, don't worry about any existing xmpMM:InstanceID. Both
728
  // will only be present when a newer file with the xmpMM:InstanceID property is updated by an
729
  // old app that uses rdf:about.
730
  
731
4.04k
  if ( ! tree.name.empty() ) {
732
733
338
    bool nameIsUUID = false;
734
338
    XMP_StringPtr nameStr = tree.name.c_str();
735
736
338
    if ( XMP_LitNMatch ( nameStr, "uuid:", 5 ) ) {
737
738
32
      nameIsUUID = true;
739
740
306
    } else if ( tree.name.size() == 36 ) {
741
742
180
      nameIsUUID = true;  // ! Assume true, we'll set it to false below if not.
743
4.18k
      for ( int i = 0;  i < 36; ++i ) {
744
4.13k
        char ch = nameStr[i];
745
4.13k
        if ( ch == '-' ) {
746
162
          if ( (i == 8) || (i == 13) || (i == 18) || (i == 23) ) continue;
747
42
          nameIsUUID = false;
748
42
          break;
749
3.97k
        } else {
750
3.97k
          if ( (('0' <= ch) && (ch <= '9')) || (('a' <= ch) && (ch <= 'z')) ) continue;
751
91
          nameIsUUID = false;
752
91
          break;
753
3.97k
        }
754
4.13k
      }
755
756
180
    }
757
    
758
338
    if ( nameIsUUID ) {
759
760
79
      XMP_ExpandedXPath expPath;
761
79
      ExpandXPath ( kXMP_NS_XMP_MM, "InstanceID", &expPath );
762
79
      XMP_Node * idNode = FindNode ( &tree, expPath, kXMP_CreateNodes, 0 );
763
79
      if ( idNode == 0 ) XMP_Throw ( "Failure creating xmpMM:InstanceID", kXMPErr_InternalFailure );
764
765
79
      idNode->options = 0;  // Clobber any existing xmpMM:InstanceID.
766
79
      idNode->value = tree.name;
767
79
      idNode->RemoveChildren();
768
79
      idNode->RemoveQualifiers();
769
770
79
      tree.name.erase();
771
772
79
    }
773
774
338
  }
775
776
4.04k
}  // TouchUpDataModel
777
778
779
// -------------------------------------------------------------------------------------------------
780
// DetermineInputEncoding
781
// ----------------------
782
//
783
// Try to determine the character encoding, making a guess if the input is too short. We make some
784
// simplifying assumptions: the first character must be U+FEFF or ASCII, U+0000 is not allowed. The
785
// XML 1.1 spec is even more strict, UTF-16 XML documents must begin with U+FEFF, and the first
786
// "real" character must be '<'. Ignoring the XML declaration, the first XML character could be '<',
787
// space, tab, CR, or LF.
788
//
789
// The possible input sequences are:
790
//
791
//   Cases with U+FEFF
792
//      EF BB BF -- - UTF-8
793
//      FE FF -- -- - Big endian UTF-16
794
//      00 00 FE FF - Big endian UTF 32
795
//      FF FE 00 00 - Little endian UTF-32
796
//      FF FE -- -- - Little endian UTF-16
797
//
798
//   Cases with ASCII
799
//      nn mm -- -- - UTF-8 -
800
//      00 00 00 nn - Big endian UTF-32
801
//      00 nn -- -- - Big endian UTF-16
802
//      nn 00 00 00 - Little endian UTF-32
803
//      nn 00 -- -- - Little endian UTF-16
804
//
805
// ! We don't check for full patterns, or for errors. We just check enough to determine what the
806
// ! only possible (or reasonable) case would be.
807
808
static XMP_OptionBits
DetermineInputEncoding ( const XMP_Uns8 * buffer, size_t length )
{
  // Guess the encoding from the first few bytes. Inputs shorter than 2 bytes are called UTF-8.
  // Only enough bytes are examined to pick between the cases listed below, full patterns are
  // not verified. The third byte is only examined when at least 4 bytes are available.

  if ( length < 2 ) return kXMP_EncodeUTF8;

  const XMP_Uns8 first = buffer[0];
  const bool haveFour = (length >= 4);

  if ( first == 0 ) {

    // Leading zero byte:
    //   00 nn -- -- - Big endian UTF-16
    //   00 00 00 nn - Big endian UTF-32
    //   00 00 FE FF - Big endian UTF 32
    const bool isUTF32 = haveFour && (buffer[1] == 0);
    return ( isUTF32 ? kXMP_EncodeUTF32Big : kXMP_EncodeUTF16Big );

  }

  if ( first < 0x80 ) {

    // Leading ASCII byte:
    //   nn mm -- -- - UTF-8
    //   nn 00 00 00 - Little endian UTF-32
    //   nn 00 -- -- - Little endian UTF-16
    if ( buffer[1] != 0 ) return kXMP_EncodeUTF8;
    const bool isUTF32 = haveFour && (buffer[2] == 0);
    return ( isUTF32 ? kXMP_EncodeUTF32Little : kXMP_EncodeUTF16Little );

  }

  // Leading byte is 0x80 or above:
  //   EF BB BF -- - UTF-8
  //   FE FF -- -- - Big endian UTF-16
  //   FF FE 00 00 - Little endian UTF-32
  //   FF FE -- -- - Little endian UTF-16

  if ( first == 0xEF ) return kXMP_EncodeUTF8;
  if ( first == 0xFE ) return kXMP_EncodeUTF16Big;

  const bool isUTF32 = haveFour && (buffer[2] == 0);
  return ( isUTF32 ? kXMP_EncodeUTF32Little : kXMP_EncodeUTF16Little );

}  // DetermineInputEncoding
852
853
854
// -------------------------------------------------------------------------------------------------
855
// CountUTF8
856
// ---------
857
//
858
// Look for a valid multi-byte UTF-8 sequence and return its length. Returns 0 for an invalid UTF-8
859
// sequence. Returns a negative value for a partial valid sequence at the end of the buffer.
860
//
861
// The checking is not strict. We simply count the number of high order 1 bits in the first byte,
862
// then look for n-1 following bytes whose high order 2 bits are 1 and 0. We do not check for a
863
// minimal length representation of the codepoint, or that the codepoint is defined by Unicode.
864
865
static int
866
CountUTF8 ( const XMP_Uns8 * charStart, const XMP_Uns8 * bufEnd )
867
63.8k
{
868
63.8k
  XMP_Assert ( charStart < bufEnd );    // Catch this in debug builds.
869
63.8k
  if ( charStart >= bufEnd ) return 0; // Don't run-on in release builds.
870
63.8k
  if ( (*charStart & 0xC0) != 0xC0 ) return 0; // Must have at least 2 high bits set.
871
  
872
63.8k
  int byteCount = 2;
873
63.8k
  XMP_Uns8 firstByte = *charStart;
874
72.7k
  for ( firstByte = firstByte << 2; (firstByte & 0x80) != 0; firstByte = firstByte << 1 ) ++byteCount;
875
  
876
63.8k
  if ( (charStart + byteCount) > bufEnd ) return -byteCount;
877
878
136k
  for ( int i = 1; i < byteCount; ++i ) {
879
72.7k
    if ( (charStart[i] & 0xC0) != 0x80 ) return 0;
880
72.7k
  }
881
  
882
63.8k
  return byteCount;
883
  
884
63.8k
}  // CountUTF8
885
886
887
// -------------------------------------------------------------------------------------------------
888
// CountControlEscape
889
// ------------------
890
//
891
// Look for a numeric escape sequence for a "prohibited" ASCII control character. These are 0x7F,
892
// and the range 0x00..0x1F except for tab/LF/CR. Return 0 if this is definitely not a numeric
893
// escape, the length of the escape if found, or a negative value for a partial escape.
894
895
static int
896
CountControlEscape ( const XMP_Uns8 * escStart, const XMP_Uns8 * bufEnd )
897
171k
{
898
171k
  XMP_Assert ( escStart < bufEnd ); // Catch this in debug builds.
899
171k
  if ( escStart >= bufEnd ) return 0; // Don't run-on in release builds.
900
171k
  XMP_Assert ( *escStart == '&' );
901
  
902
171k
  size_t tailLen = bufEnd - escStart;
903
171k
  if ( tailLen < 5 ) return -1;  // Don't need a more thorough check, we'll catch it on the next pass.
904
  
905
171k
  if ( strncmp ( (char*)escStart, "&#x", 3 ) != 0 ) return 0;
906
  
907
107k
  XMP_Uns8 escValue = 0;
908
107k
  const XMP_Uns8 * escPos = escStart + 3;
909
  
910
107k
  if ( ('0' <= *escPos) && (*escPos <= '9') ) {
911
58.8k
    escValue = *escPos - '0';
912
58.8k
    ++escPos;
913
58.8k
  } else if ( ('A' <= *escPos) && (*escPos <= 'F') ) {
914
6.79k
    escValue = *escPos - 'A' + 10;
915
6.79k
    ++escPos;
916
41.9k
  } else if ( ('a' <= *escPos) && (*escPos <= 'f') ) {
917
15.4k
    escValue = *escPos - 'a' + 10;
918
15.4k
    ++escPos;
919
15.4k
  }
920
  
921
107k
  if ( ('0' <= *escPos) && (*escPos <= '9') ) {
922
31.2k
    escValue = (escValue << 4) + (*escPos - '0');
923
31.2k
    ++escPos;
924
76.2k
  } else if ( ('A' <= *escPos) && (*escPos <= 'F') ) {
925
2.97k
    escValue = (escValue << 4) + (*escPos - 'A' + 10);
926
2.97k
    ++escPos;
927
73.3k
  } else if ( ('a' <= *escPos) && (*escPos <= 'f') ) {
928
5.80k
    escValue = (escValue << 4) + (*escPos - 'a' + 10);
929
5.80k
    ++escPos;
930
5.80k
  }
931
  
932
107k
  if ( escPos == bufEnd ) return -1; // Partial escape.
933
107k
  if ( *escPos != ';' ) return 0;
934
  
935
15.3k
  size_t escLen = escPos - escStart + 1;
936
15.3k
  if ( escLen < 5 ) return 0;  // ! Catch "&#x;".
937
  
938
11.1k
  if ( (escValue == kTab) || (escValue == kLF) || (escValue == kCR) ) return 0;  // An allowed escape.
939
  
940
8.93k
  return escLen; // Found a full "prohibited" numeric escape.
941
  
942
11.1k
}  // CountControlEscape
943
944
945
// -------------------------------------------------------------------------------------------------
946
// ProcessUTF8Portion
947
// ------------------
948
//
949
// Early versions of the XMP spec mentioned allowing ISO Latin-1 input. There are also problems with
950
// some clients placing ASCII control characters within XMP values. This is an XML problem, the XML
951
// spec only allows tab (0x09), LF (0x0A), and CR (0x0D) from the 0x00..0x1F range. As a concession
952
// to this we scan 8-bit input for byte sequences that are not valid UTF-8 or in the 0x00..0x1F
953
// range and replace each byte as follows:
954
//   0x00..0x1F - Replace with a space, except for tab, CR, and LF.
955
//   0x7F       - Replace with a space. This is ASCII Delete, not allowed by ISO Latin-1.
956
//   0x80..0x9F - Replace with the UTF-8 for a corresponding Unicode character.
957
//   0xA0..0XFF - Replace with the UTF-8 for a corresponding Unicode character.
958
//
959
// The 0x80..0x9F range is not defined by Latin-1. But the Windows 1252 code page defines these and
960
// is otherwise the same as Latin-1.
961
//
962
// For at least historical compatibility reasons we also find and replace singly escaped ASCII
963
// control characters. The Expat parser we're using does not allow numeric escapes like "&#x10;".
964
// The XML spec is clear that raw controls are not allowed (in the RestrictedChar set), but it isn't
965
// as clear about numeric escapes for them. At any rate, Expat complains, so we treat the numeric
966
// escapes like raw characters and replace them with a space.
967
//
968
// We check for 1 or 2 hex digits ("&#x9;" or "&#x09;") and upper or lower case ("&#xA;" or "&#xa;").
969
// The full escape sequence is 5 or 6 bytes.
970
971
static size_t
972
ProcessUTF8Portion ( XMLParserAdapter * xmlParser,
973
           const XMP_Uns8 *   buffer,
974
           size_t       length,
975
           bool       last )
976
5.91k
{
977
5.91k
  const XMP_Uns8 * bufEnd = buffer + length;
978
  
979
5.91k
  const XMP_Uns8 * spanEnd;
980
981
  // `buffer` is copied into this std::string. If `buffer` only
982
  // contains valid UTF-8 and no escape characters, then the copy
983
  // will be identical to the original, but invalid characters are
984
  // replaced - usually with a space character.  This std::string was
985
  // added as a performance fix for:
986
  // https://github.com/Exiv2/exiv2/security/advisories/GHSA-w8mv-g8qq-36mj
987
  // Previously, the code was repeatedly calling
988
  // `xmlParser->ParseBuffer()`, which turned out to have quadratic
989
  // complexity, because expat kept reparsing the entire string from
990
  // the beginning.
991
5.91k
  std::string copy;
992
    
993
71.1M
  for ( spanEnd = buffer; spanEnd < bufEnd; ++spanEnd ) {
994
995
71.1M
    if ( (0x20 <= *spanEnd) && (*spanEnd <= 0x7E) && (*spanEnd != '&') ) {
996
69.5M
      copy.push_back(*spanEnd);
997
69.5M
      continue; // A regular ASCII character.
998
69.5M
    }
999
1000
1.62M
    if ( *spanEnd >= 0x80 ) {
1001
    
1002
      // See if this is a multi-byte UTF-8 sequence, or a Latin-1 character to replace.
1003
1004
63.8k
      int uniLen = CountUTF8 ( spanEnd, bufEnd );
1005
1006
63.8k
      if ( uniLen > 0 ) {
1007
1008
        // A valid UTF-8 character, keep it as-is.
1009
63.8k
        copy.append((const char*)spanEnd, uniLen);
1010
63.8k
        spanEnd += uniLen - 1;  // ! The loop increment will put back the +1.
1011
1012
63.8k
      } else if ( (uniLen < 0) && (! last) ) {
1013
1014
        // Have a partial UTF-8 character at the end of the buffer and more input coming.
1015
0
        xmlParser->ParseBuffer ( copy.c_str(), copy.size(), false );
1016
0
        return (spanEnd - buffer);
1017
1018
0
      } else {
1019
1020
        // Not a valid UTF-8 sequence. Replace the first byte with the Latin-1 equivalent.
1021
0
        const char * replacement = kReplaceLatin1 [ *spanEnd - 0x80 ];
1022
0
        copy.append ( replacement );
1023
1024
0
      }
1025
    
1026
1.55M
    } else if ( (*spanEnd < 0x20) || (*spanEnd == 0x7F) ) {
1027
1028
      // Replace ASCII controls other than tab, LF, and CR with a space.
1029
1030
1.38M
      if ( (*spanEnd == kTab) || (*spanEnd == kLF) || (*spanEnd == kCR) ) {
1031
1.36M
        copy.push_back(*spanEnd);
1032
1.36M
        continue;
1033
1.36M
      }
1034
1035
19.3k
      copy.push_back(' ');
1036
    
1037
171k
    } else {
1038
    
1039
      // See if this is a numeric escape sequence for a prohibited ASCII control.
1040
      
1041
171k
      XMP_Assert ( *spanEnd == '&' );
1042
171k
      int escLen = CountControlEscape ( spanEnd, bufEnd );
1043
      
1044
171k
      if ( escLen < 0 ) {
1045
1046
        // Have a partial numeric escape in this buffer, wait for more input.
1047
17
        if ( last ) {
1048
17
          copy.push_back('&');
1049
17
          continue; // No more buffers, not an escape, absorb as normal input.
1050
17
        }
1051
0
        xmlParser->ParseBuffer ( copy.c_str(), copy.size(), false );
1052
0
        return (spanEnd - buffer);
1053
1054
171k
      } else if ( escLen > 0 ) {
1055
1056
        // Have a complete numeric escape to replace.
1057
8.93k
        copy.push_back(' ');
1058
8.93k
        spanEnd = spanEnd + escLen - 1; // ! The loop continuation will increment spanEnd!
1059
1060
162k
      } else {
1061
162k
        copy.push_back('&');
1062
162k
      }
1063
1064
171k
    }
1065
    
1066
1.62M
  }
1067
  
1068
5.91k
  XMP_Assert ( spanEnd == bufEnd );
1069
5.91k
  copy.push_back(' ');
1070
5.91k
  xmlParser->ParseBuffer ( copy.c_str(), copy.size(), true );
1071
5.91k
  return length;
1072
1073
5.91k
}  // ProcessUTF8Portion
1074
1075
1076
// -------------------------------------------------------------------------------------------------
1077
// ParseFromBuffer
1078
// ---------------
1079
//
1080
// Although most clients will probably parse everything in one call, we have a buffered API model
1081
// and need to support even the extreme case of 1 byte at a time parsing. This is considerably
1082
// complicated by some special cases for 8-bit input. Because of this, the first thing we do is
1083
// determine whether the input is 8-bit, UTF-16, or UTF-32.
1084
//
1085
// Both the 8-bit special cases and the encoding determination are easier to do with 8 bytes or more
1086
// of input. The XMLParserAdapter class has a pending-input buffer for this. At the start of parsing
1087
// we (might) try to fill this buffer before determining the input character encoding. After that,
1088
// we (might) use this buffer with the current input to simplify the logic in Process8BitInput. The
1089
// "(might)" part means that we don't actually use the pending-input buffer unless we have to. In
1090
// particular, the common case of single-buffer parsing won't use it.
1091
1092
void
1093
XMPMeta::ParseFromBuffer ( XMP_StringPtr  buffer,
1094
               XMP_StringLen  xmpSize,
1095
               XMP_OptionBits options )
1096
5.90k
{
1097
5.90k
  if ( (buffer == 0) && (xmpSize != 0) ) XMP_Throw ( "Null parse buffer", kXMPErr_BadParam );
1098
5.90k
  if ( xmpSize == kXMP_UseNullTermination ) xmpSize = strlen ( buffer );
1099
  
1100
5.90k
  const bool lastClientCall = ((options & kXMP_ParseMoreBuffers) == 0); // *** Could use FlagIsSet & FlagIsClear macros.
1101
  
1102
5.90k
  this->tree.ClearNode(); // Make sure the target XMP object is totally empty.
1103
1104
5.90k
  if ( this->xmlParser == 0 ) {
1105
5.90k
    if ( (xmpSize == 0) && lastClientCall ) return; // Tolerate empty parse. Expat complains if there are no XML elements.
1106
5.90k
    this->xmlParser = XMP_NewExpatAdapter();
1107
5.90k
  }
1108
  
1109
5.90k
  XMLParserAdapter& parser = *this->xmlParser;
1110
  
1111
  #if 0 // XMP_DebugBuild
1112
    if ( parser.parseLog != 0 ) {
1113
      char message [200]; // AUDIT: Using sizeof(message) below for snprintf length is safe.
1114
      snprintf ( message, sizeof(message), "<!-- ParseFromBuffer, length = %d, options = %X%s -->", // AUDIT: See above.
1115
             xmpSize, options, (lastClientCall ? " (last)" : "") );
1116
      fwrite ( message, 1, strlen(message), parser.parseLog );
1117
      fflush ( parser.parseLog );
1118
    }
1119
  #endif
1120
    
1121
5.90k
  try { // Cleanup the tree and xmlParser if anything fails.
1122
  
1123
    // Determine the character encoding before doing any real parsing. This is needed to do the
1124
    // 8-bit special processing.
1125
    
1126
5.90k
    if ( parser.charEncoding == XMP_OptionBits(-1) ) {
1127
1128
5.90k
      if ( (parser.pendingCount == 0) && (xmpSize >= kXMLPendingInputMax) ) {
1129
1130
        // This ought to be the common case, the first buffer is big enough.
1131
5.88k
        parser.charEncoding = DetermineInputEncoding ( (XMP_Uns8*)buffer, xmpSize );
1132
1133
5.88k
      } else {
1134
      
1135
        // Try to fill the pendingInput buffer before calling DetermineInputEncoding.
1136
1137
12
        size_t pendingOverlap = kXMLPendingInputMax - parser.pendingCount;
1138
12
        if ( pendingOverlap > xmpSize ) pendingOverlap = xmpSize;
1139
1140
12
        memcpy ( &parser.pendingInput[parser.pendingCount], buffer, pendingOverlap ); // AUDIT: Count is safe.
1141
12
        buffer += pendingOverlap;
1142
12
        xmpSize -= pendingOverlap;
1143
12
        parser.pendingCount += pendingOverlap;
1144
1145
12
        if ( (! lastClientCall) && (parser.pendingCount < kXMLPendingInputMax) ) return;
1146
12
        parser.charEncoding = DetermineInputEncoding ( parser.pendingInput, parser.pendingCount );
1147
        
1148
        #if Trace_ParsingHackery
1149
          fprintf ( stderr, "XMP Character encoding is %d\n", parser.charEncoding );
1150
        #endif
1151
      
1152
12
      }
1153
1154
5.90k
    }
1155
    
1156
    // We have the character encoding. Process UTF-16 and UTF-32 as is. UTF-8 needs special
1157
    // handling to take care of things like ISO Latin-1 or unescaped ASCII controls.
1158
1159
5.90k
    XMP_Assert ( parser.charEncoding != XMP_OptionBits(-1) );
1160
1161
5.90k
    if ( parser.charEncoding != kXMP_EncodeUTF8 ) {
1162
    
1163
0
      if ( parser.pendingCount > 0 ) {
1164
        // Might have pendingInput from the above portion to determine the character encoding.
1165
0
        parser.ParseBuffer ( parser.pendingInput, parser.pendingCount, false );
1166
0
      }
1167
0
      parser.ParseBuffer ( buffer, xmpSize, lastClientCall );
1168
      
1169
5.90k
    } else {
1170
1171
      #if Trace_ParsingHackery
1172
        fprintf ( stderr, "Parsing %d bytes @ %.8X, %s, %d pending, context: %.8s\n",
1173
              xmpSize, buffer, (lastClientCall ? "last" : "not last"), parser.pendingCount, buffer );
1174
      #endif
1175
1176
      // The UTF-8 processing is a bit complex due to the need to tolerate ISO Latin-1 input.
1177
      // This is done by scanning the input for byte sequences that are not valid UTF-8,
1178
      // assuming they are Latin-1 characters in the range 0x80..0xFF. This requires saving a
1179
      // pending input buffer to handle partial UTF-8 sequences at the end of a buffer.
1180
      
1181
5.91k
      while ( parser.pendingCount > 0 ) {
1182
      
1183
        // We've got some leftover input, process it first then continue with the current
1184
        // buffer. Try to fill the pendingInput buffer before parsing further. We use a loop
1185
        // for weird edge cases like a 2 byte input buffer, using 1 byte for pendingInput,
1186
        // then having a partial UTF-8 end and need to absorb more.
1187
        
1188
12
        size_t pendingOverlap = kXMLPendingInputMax - parser.pendingCount;
1189
12
        if ( pendingOverlap > xmpSize ) pendingOverlap = xmpSize;
1190
        
1191
12
        memcpy ( &parser.pendingInput[parser.pendingCount], buffer, pendingOverlap ); // AUDIT: Count is safe.
1192
12
        parser.pendingCount += pendingOverlap;
1193
12
        buffer += pendingOverlap;
1194
12
        xmpSize -= pendingOverlap;
1195
1196
12
        if ( (! lastClientCall) && (parser.pendingCount < kXMLPendingInputMax) ) return;
1197
12
        size_t bytesDone = ProcessUTF8Portion ( &parser, parser.pendingInput, parser.pendingCount, lastClientCall );
1198
12
        size_t bytesLeft = parser.pendingCount - bytesDone;
1199
1200
        #if Trace_ParsingHackery
1201
          fprintf ( stderr, "   ProcessUTF8Portion handled %d pending bytes\n", bytesDone );
1202
        #endif
1203
        
1204
12
        if ( bytesDone == parser.pendingCount ) {
1205
1206
          // Done with all of the pending input, move on to the current buffer.
1207
12
          parser.pendingCount = 0;
1208
1209
12
        } else if ( bytesLeft <= pendingOverlap ) {
1210
1211
          // The leftover pending input all came from the current buffer. Exit this loop.
1212
0
          buffer -= bytesLeft;
1213
0
          xmpSize += bytesLeft;
1214
0
          parser.pendingCount = 0;
1215
1216
0
        } else if ( xmpSize > 0 ) {
1217
1218
          // Pull more of the current buffer into the pending input and try again.
1219
          // Backup by this pass's overlap so the loop entry code runs OK.
1220
0
          parser.pendingCount -= pendingOverlap;
1221
0
          buffer -= pendingOverlap;
1222
0
          xmpSize += pendingOverlap;
1223
1224
0
        } else {
1225
1226
          // There is no more of the current buffer. Wait for more. Partial sequences at
1227
          // the end of the last buffer should be treated as Latin-1 by ProcessUTF8Portion.
1228
0
          XMP_Assert ( ! lastClientCall );
1229
0
          parser.pendingCount = bytesLeft;
1230
0
          memcpy ( &parser.pendingInput[0], &parser.pendingInput[bytesDone], bytesLeft ); // AUDIT: Count is safe.
1231
0
          return;
1232
1233
0
        }
1234
      
1235
12
      }
1236
      
1237
      // Done with the pending input, process the current buffer.
1238
1239
5.90k
      size_t bytesDone = ProcessUTF8Portion ( &parser, (XMP_Uns8*)buffer, xmpSize, lastClientCall );
1240
1241
      #if Trace_ParsingHackery
1242
        fprintf ( stderr, "   ProcessUTF8Portion handled %d additional bytes\n", bytesDone );
1243
      #endif
1244
      
1245
5.90k
      if ( bytesDone < xmpSize ) {
1246
1247
0
        XMP_Assert ( ! lastClientCall );
1248
0
        size_t bytesLeft = xmpSize - bytesDone;
1249
0
        if ( bytesLeft > kXMLPendingInputMax ) XMP_Throw ( "Parser bytesLeft too large", kXMPErr_InternalFailure );
1250
1251
0
        memcpy ( parser.pendingInput, &buffer[bytesDone], bytesLeft );  // AUDIT: Count is safe.
1252
0
        parser.pendingCount = bytesLeft;
1253
0
        return; // Wait for the next buffer.
1254
1255
0
      }
1256
1257
5.90k
    }
1258
    
1259
5.90k
    if ( lastClientCall ) {
1260
    
1261
      #if XMP_DebugBuild && DumpXMLParseTree
1262
        if ( parser.parseLog == 0 ) parser.parseLog = stdout;
1263
        DumpXMLTree ( parser.parseLog, parser.tree, 0 );
1264
      #endif
1265
1266
5.88k
      const XML_Node * xmlRoot = FindRootNode ( this, *this->xmlParser, options );
1267
1268
5.88k
      if ( xmlRoot != 0 ) {
1269
1270
5.44k
        ProcessRDF ( &this->tree, *xmlRoot, options );
1271
5.44k
        NormalizeDCArrays ( &this->tree );
1272
5.44k
        if ( this->tree.options & kXMP_PropHasAliases ) MoveExplicitAliases ( &this->tree, options );
1273
5.44k
        TouchUpDataModel ( this );
1274
        
1275
        // Delete empty schema nodes. Do this last, other cleanup can make empty schema.
1276
5.44k
        size_t schemaNum = 0;
1277
16.3k
        while ( schemaNum < this->tree.children.size() ) {
1278
10.8k
          XMP_Node * currSchema = this->tree.children[schemaNum];
1279
10.8k
          if ( currSchema->children.size() > 0 ) {
1280
10.8k
            ++schemaNum;
1281
10.8k
          } else {
1282
0
            delete this->tree.children[schemaNum];  // ! Delete the schema node itself.
1283
0
            this->tree.children.erase ( this->tree.children.begin() + schemaNum );
1284
0
          }
1285
10.8k
        }
1286
        
1287
5.44k
      }
1288
1289
5.88k
      delete this->xmlParser;
1290
5.88k
      this->xmlParser = 0;
1291
1292
5.88k
    }
1293
    
1294
5.90k
  } catch ( ... ) {
1295
1296
1.41k
    delete this->xmlParser;
1297
1.41k
    this->xmlParser = 0;
1298
1.41k
    prevTkVer = 0;
1299
1.41k
    this->tree.ClearNode();
1300
1.41k
    throw;
1301
1302
1.41k
  }
1303
  
1304
5.90k
}  // ParseFromBuffer
1305
1306
// =================================================================================================