Coverage Report

Created: 2018-09-25 14:53

/src/mozilla-central/intl/hyphenation/glue/nsHyphenator.cpp
Line
Count
Source (jump to first uncovered line)
1
/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2
/* This Source Code Form is subject to the terms of the Mozilla Public
3
 * License, v. 2.0. If a copy of the MPL was not distributed with this
4
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
5
6
#include "nsHyphenator.h"
7
#include "nsIFile.h"
8
#include "nsUTF8Utils.h"
9
#include "nsUnicodeProperties.h"
10
#include "nsIURI.h"
11
12
#include "hyphen.h"
13
14
/**
 * Construct a hyphenator from the pattern resource identified by aURI.
 *
 * The URI's spec string is handed to hnj_hyphen_load(); whatever that
 * returns is stored in mDict. If the spec cannot be obtained, mDict stays
 * null, which IsValid() reports as an unusable hyphenator.
 */
nsHyphenator::nsHyphenator(nsIURI *aURI)
  : mDict(nullptr)
{
  nsCString spec;
  if (NS_FAILED(aURI->GetSpec(spec))) {
    // No spec to load from: leave mDict null.
    return;
  }

  mDict = hnj_hyphen_load(spec.get());

#ifdef DEBUG
  // Debug-only trace of a successful load.
  if (mDict) {
    printf("loaded hyphenation patterns from %s\n", spec.get());
  }
#endif
}
29
30
/**
 * Release the dictionary held in mDict, if any, via hnj_hyphen_free().
 */
nsHyphenator::~nsHyphenator()
{
  if (!mDict) {
    // Nothing was loaded, so there is nothing to free.
    return;
  }

  hnj_hyphen_free(static_cast<HyphenDict*>(mDict));
  mDict = nullptr;
}
37
38
bool
39
nsHyphenator::IsValid()
40
0
{
41
0
  return (mDict != nullptr);
42
0
}
43
44
/**
 * Find potential hyphenation points in aString.
 *
 * The string is scanned for words, where a word is a maximal run of
 * characters whose Unicode general category is Letter or Mark. Each word
 * is lowercased, converted to UTF-8 and passed to libhyphen; the result is
 * mapped back onto UTF-16 code unit offsets.
 *
 * @param aString  The UTF-16 text to examine.
 * @param aHyphens Output. Resized to aString.Length() and cleared to false;
 *                 an entry is then set to true at the offset of the first
 *                 code unit of each character that libhyphen flags (low bit
 *                 of its per-character result set).
 * @return NS_ERROR_OUT_OF_MEMORY if aHyphens cannot be resized,
 *         NS_ERROR_NOT_INITIALIZED if no dictionary is loaded (aHyphens is
 *         still sized and all false in that case), NS_OK otherwise. A
 *         failure reported by libhyphen for an individual word is not
 *         propagated; that word simply gets no hyphenation points.
 */
nsresult
nsHyphenator::Hyphenate(const nsAString& aString, nsTArray<bool>& aHyphens)
{
  if (!aHyphens.SetLength(aString.Length(), mozilla::fallible)) {
    return NS_ERROR_OUT_OF_MEMORY;
  }
  memset(aHyphens.Elements(), false, aHyphens.Length() * sizeof(bool));

  // A hyphenator whose dictionary failed to load (see IsValid()) must not
  // hand a null dictionary to libhyphen. The output has already been sized
  // and cleared above, so a caller that ignores the error still sees a
  // consistent "no hyphenation points" array.
  if (!mDict) {
    return NS_ERROR_NOT_INITIALIZED;
  }

  bool inWord = false;
  uint32_t wordStart = 0, wordLimit = 0;
  uint32_t chLen;
  for (uint32_t i = 0; i < aString.Length(); i += chLen) {
    uint32_t ch = aString[i];
    chLen = 1;

    // Combine a surrogate pair into a single code point so that it is
    // classified (and stepped over) as one character.
    if (NS_IS_HIGH_SURROGATE(ch)) {
      if (i + 1 < aString.Length() && NS_IS_LOW_SURROGATE(aString[i+1])) {
        ch = SURROGATE_TO_UCS4(ch, aString[i+1]);
        chLen = 2;
      } else {
        NS_WARNING("unpaired surrogate found during hyphenation");
      }
    }

    nsUGenCategory cat = mozilla::unicode::GetGenCategory(ch);
    if (cat == nsUGenCategory::kLetter || cat == nsUGenCategory::kMark) {
      if (!inWord) {
        inWord = true;
        wordStart = i;
      }
      wordLimit = i + chLen;
      // Keep accumulating the word unless this character is the last one
      // in the string, in which case fall through and process the word now.
      if (i + chLen < aString.Length()) {
        continue;
      }
    }

    if (inWord) {
      // Convert the word to utf-8 for libhyphen, lowercasing it as we go
      // so that it will match the (lowercased) patterns (bug 1105644).
      nsAutoCString utf8;
      const char16_t* const begin = aString.BeginReading();
      const char16_t* src = begin + wordStart;
      const char16_t* const srcEnd = begin + wordLimit;
      while (src < srcEnd) {
        uint32_t wordCh = *src++;

        if (NS_IS_HIGH_SURROGATE(wordCh)) {
          if (src < srcEnd && NS_IS_LOW_SURROGATE(*src)) {
            wordCh = SURROGATE_TO_UCS4(wordCh, *src++);
          } else {
            wordCh = 0xfffd; // unpaired surrogate, treat as REPLACEMENT CHAR
          }
        } else if (NS_IS_LOW_SURROGATE(wordCh)) {
          wordCh = 0xfffd; // unpaired surrogate
        }

        // XXX What about language-specific casing? Consider Turkish I/i...
        // In practice, it looks like the current patterns will not be
        // affected by this, as they treat dotted and undotted i similarly.
        wordCh = ToLowerCase(wordCh);

        if (wordCh < 0x80) { // U+0000 - U+007F
          utf8.Append(wordCh);
        } else if (wordCh < 0x0800) { // U+0080 - U+07FF
          utf8.Append(0xC0 | (wordCh >> 6));
          utf8.Append(0x80 | (0x003F & wordCh));
        } else if (wordCh < 0x10000) { // U+0800 - U+D7FF,U+E000 - U+FFFF
          utf8.Append(0xE0 | (wordCh >> 12));
          utf8.Append(0x80 | (0x003F & (wordCh >> 6)));
          utf8.Append(0x80 | (0x003F & wordCh));
        } else { // U+10000 and above: four-byte sequence
          utf8.Append(0xF0 | (wordCh >> 18));
          utf8.Append(0x80 | (0x003F & (wordCh >> 12)));
          utf8.Append(0x80 | (0x003F & (wordCh >> 6)));
          utf8.Append(0x80 | (0x003F & wordCh));
        }
      }

      // The result buffer is five bytes longer than the utf-8 word;
      // presumably this matches libhyphen's documented buffer requirement
      // (TODO confirm against hyphen.h).
      AutoTArray<char,200> utf8hyphens;
      utf8hyphens.SetLength(utf8.Length() + 5);
      // NOTE(review): rep/pos/cut are out-parameters that are never read or
      // released in this function. Confirm whether libhyphen can allocate
      // them (non-standard hyphenation) and, if so, how they must be freed.
      char **rep = nullptr;
      int *pos = nullptr;
      int *cut = nullptr;
      int err = hnj_hyphen_hyphenate2(static_cast<HyphenDict*>(mDict),
                                      utf8.BeginReading(), utf8.Length(),
                                      utf8hyphens.Elements(), nullptr,
                                      &rep, &pos, &cut);
      if (!err) {
        // Surprisingly, hnj_hyphen_hyphenate2 converts the 'hyphens' buffer
        // from utf8 code unit indexing (which would match the utf8 input
        // string directly) to Unicode character indexing.
        // We then need to convert this to utf16 code unit offsets for Gecko.
        const char *hyphPtr = utf8hyphens.Elements();
        const char16_t* unit = begin + wordStart;
        const char16_t* const unitEnd = begin + wordLimit;
        while (unit < unitEnd) {
          if (*hyphPtr & 0x01) {
            aHyphens[unit - begin] = true;
          }
          unit++;
          // Step over the trailing half of a surrogate pair so that one
          // libhyphen entry is consumed per character, not per code unit.
          if (unit < unitEnd && NS_IS_LOW_SURROGATE(*unit) &&
              NS_IS_HIGH_SURROGATE(*(unit-1)))
          {
            unit++;
          }
          hyphPtr++;
        }
      }
    }

    inWord = false;
  }

  return NS_OK;
}