/src/mozilla-central/intl/hyphenation/glue/nsHyphenator.cpp

Source (jump to first uncovered line)
/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

#include "nsHyphenator.h"
#include "nsIFile.h"
#include "nsUTF8Utils.h"
#include "nsUnicodeProperties.h"
#include "nsIURI.h"

#include "hyphen.h"

// Construct a hyphenator backed by the pattern resource that aURI names.
// The URI's spec string is passed to hnj_hyphen_load(); whatever that call
// returns is stored in mDict. If the spec cannot be obtained, mDict keeps
// its initial null value (see IsValid()).
nsHyphenator::nsHyphenator(nsIURI *aURI)
  : mDict(nullptr)
{
  nsCString spec;
  if (NS_FAILED(aURI->GetSpec(spec))) {
    // No spec string means nothing to load; leave mDict null.
    return;
  }

  mDict = hnj_hyphen_load(spec.get());
#ifdef DEBUG
  // Debug builds announce a successful load on stdout.
  if (mDict) {
    printf("loaded hyphenation patterns from %s\n", spec.get());
  }
#endif
}

// Release the dictionary obtained in the constructor, if there is one,
// via hnj_hyphen_free(), and reset mDict to null afterwards.
nsHyphenator::~nsHyphenator()
{
  if (!mDict) {
    // Nothing was loaded, so there is nothing to free.
    return;
  }

  hnj_hyphen_free((HyphenDict*)mDict);
  mDict = nullptr;
}

// Report whether this hyphenator holds a dictionary: true exactly when
// mDict is non-null.
bool
nsHyphenator::IsValid()
{
  return mDict ? true : false;
}

// Compute hyphenation flags for aString.
//
// aHyphens is resized to aString.Length() and cleared to false. The string
// is then scanned for "words" (maximal runs of characters whose general
// category is Letter or Mark). Each word is lowercased, converted to UTF-8
// and passed to hnj_hyphen_hyphenate2(). For every character of the word
// whose result byte has its low bit set, aHyphens is set to true at the
// UTF-16 offset where that character starts.
//
// Returns NS_ERROR_OUT_OF_MEMORY if aHyphens cannot be resized, and NS_OK
// otherwise (a non-zero result from libhyphen just leaves that word's
// flags false).
nsresult
nsHyphenator::Hyphenate(const nsAString& aString, nsTArray<bool>& aHyphens)
{
  const uint32_t length = aString.Length();
  if (!aHyphens.SetLength(length, mozilla::fallible)) {
    return NS_ERROR_OUT_OF_MEMORY;
  }
  memset(aHyphens.Elements(), false, aHyphens.Length() * sizeof(bool));

  bool scanningWord = false;
  uint32_t wordBegin = 0;
  uint32_t wordEnd = 0;
  uint32_t offset = 0;
  while (offset < length) {
    // Decode one character at |offset|, combining a surrogate pair into a
    // single code point when both halves are present.
    uint32_t codepoint = aString[offset];
    uint32_t unitCount = 1;
    if (NS_IS_HIGH_SURROGATE(codepoint)) {
      const bool paired =
        offset + 1 < length && NS_IS_LOW_SURROGATE(aString[offset + 1]);
      if (paired) {
        codepoint = SURROGATE_TO_UCS4(codepoint, aString[offset + 1]);
        unitCount = 2;
      } else {
        NS_WARNING("unpaired surrogate found during hyphenation");
      }
    }

    const uint32_t next = offset + unitCount;
    const nsUGenCategory category = mozilla::unicode::GetGenCategory(codepoint);
    const bool isWordChar = category == nsUGenCategory::kLetter ||
                            category == nsUGenCategory::kMark;
    if (isWordChar) {
      if (!scanningWord) {
        scanningWord = true;
        wordBegin = offset;
      }
      wordEnd = next;
    }

    // Nothing below depends on |offset|, so advance now.
    offset = next;

    // Keep accumulating while we are inside a word, unless this character
    // is the last one in the string, in which case the word must be
    // flushed right away.
    if (isWordChar && next < length) {
      continue;
    }

    if (scanningWord) {
      // Build the lowercased UTF-8 form of the word for libhyphen, so that
      // it will match the (lowercased) patterns (bug 1105644).
      const char16_t* const text = aString.BeginReading();
      nsAutoCString lowered;
      uint32_t idx = wordBegin;
      while (idx < wordEnd) {
        uint32_t c = text[idx];
        ++idx;

        if (NS_IS_HIGH_SURROGATE(c)) {
          if (idx < wordEnd && NS_IS_LOW_SURROGATE(text[idx])) {
            c = SURROGATE_TO_UCS4(c, text[idx]);
            ++idx;
          } else {
            c = 0xfffd; // lone high surrogate -> REPLACEMENT CHARACTER
          }
        } else if (NS_IS_LOW_SURROGATE(c)) {
          c = 0xfffd; // lone low surrogate -> REPLACEMENT CHARACTER
        }

        // XXX Language-specific casing (e.g. Turkish I/i) is not handled.
        // In practice, it looks like the current patterns will not be
        // affected by this, as they treat dotted and undotted i similarly.
        c = ToLowerCase(c);

        if (c < 0x80) {
          // U+0000 - U+007F: one byte
          lowered.Append(static_cast<char>(c));
        } else if (c < 0x0800) {
          // U+0080 - U+07FF: two bytes
          lowered.Append(static_cast<char>(0xC0 | (c >> 6)));
          lowered.Append(static_cast<char>(0x80 | (c & 0x003F)));
        } else if (c < 0x10000) {
          // U+0800 - U+D7FF, U+E000 - U+FFFF: three bytes
          lowered.Append(static_cast<char>(0xE0 | (c >> 12)));
          lowered.Append(static_cast<char>(0x80 | ((c >> 6) & 0x003F)));
          lowered.Append(static_cast<char>(0x80 | (c & 0x003F)));
        } else {
          // Supplementary planes: four bytes
          lowered.Append(static_cast<char>(0xF0 | (c >> 18)));
          lowered.Append(static_cast<char>(0x80 | ((c >> 12) & 0x003F)));
          lowered.Append(static_cast<char>(0x80 | ((c >> 6) & 0x003F)));
          lowered.Append(static_cast<char>(0x80 | (c & 0x003F)));
        }
      }

      // Output buffer for libhyphen, sized to the UTF-8 length plus 5.
      // NOTE(review): the 5 extra bytes are presumably slack that libhyphen
      // requires of its caller -- confirm against the libhyphen docs.
      AutoTArray<char,200> flags;
      flags.SetLength(lowered.Length() + 5);

      // NOTE(review): rep/pos/cut are passed as out-parameters but never
      // inspected or released here. If libhyphen can allocate them (e.g.
      // for dictionaries with non-standard hyphenation), that memory would
      // leak -- confirm against the libhyphen API contract.
      char **rep = nullptr;
      int *pos = nullptr;
      int *cut = nullptr;
      const int status = hnj_hyphen_hyphenate2((HyphenDict*)mDict,
                                               lowered.BeginReading(),
                                               lowered.Length(),
                                               flags.Elements(), nullptr,
                                               &rep, &pos, &cut);
      if (status == 0) {
        // Surprisingly, hnj_hyphen_hyphenate2 reports its results indexed
        // by Unicode character rather than by UTF-8 code unit. Walk the
        // word one character at a time, mapping each character index back
        // to the UTF-16 code unit offset that Gecko expects.
        uint32_t charIndex = 0;
        for (uint32_t unit = wordBegin; unit < wordEnd; ++charIndex) {
          if (flags[charIndex] & 0x01) {
            aHyphens[unit] = true;
          }
          ++unit;
          // Step over the trailing half of a surrogate pair so that the
          // pair counts as a single character.
          if (unit < wordEnd && NS_IS_LOW_SURROGATE(text[unit]) &&
              NS_IS_HIGH_SURROGATE(text[unit - 1])) {
            ++unit;
          }
        }
      }
    }

    scanningWord = false;
  }

  return NS_OK;
}

Line	Count	Source (jump to first uncovered line)
1		/* -- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -- */
2		/* This Source Code Form is subject to the terms of the Mozilla Public
3		* License, v. 2.0. If a copy of the MPL was not distributed with this
4		* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
5
6		#include "nsHyphenator.h"
7		#include "nsIFile.h"
8		#include "nsUTF8Utils.h"
9		#include "nsUnicodeProperties.h"
10		#include "nsIURI.h"
11
12		#include "hyphen.h"
13
14		nsHyphenator::nsHyphenator(nsIURI *aURI)
15		: mDict(nullptr)
16	0	{
17	0	nsCString uriSpec;
18	0	nsresult rv = aURI->GetSpec(uriSpec);
19	0	if (NS_FAILED(rv)) {
20	0	return;
21	0	}
22	0	mDict = hnj_hyphen_load(uriSpec.get());
23		#ifdef DEBUG
24		if (mDict) {
25		printf("loaded hyphenation patterns from %s\n", uriSpec.get());
26		}
27		#endif
28		}
29
30		nsHyphenator::~nsHyphenator()
31	0	{
32	0	if (mDict != nullptr) {
33	0	hnj_hyphen_free((HyphenDict*)mDict);
34	0	mDict = nullptr;
35	0	}
36	0	}
37
38		bool
39		nsHyphenator::IsValid()
40	0	{
41	0	return (mDict != nullptr);
42	0	}
43
44		nsresult
45		nsHyphenator::Hyphenate(const nsAString& aString, nsTArray<bool>& aHyphens)
46	0	{
47	0	if (!aHyphens.SetLength(aString.Length(), mozilla::fallible)) {
48	0	return NS_ERROR_OUT_OF_MEMORY;
49	0	}
50	0	memset(aHyphens.Elements(), false, aHyphens.Length() * sizeof(bool));
51	0
52	0	bool inWord = false;
53	0	uint32_t wordStart = 0, wordLimit = 0;
54	0	uint32_t chLen;
55	0	for (uint32_t i = 0; i < aString.Length(); i += chLen) {
56	0	uint32_t ch = aString[i];
57	0	chLen = 1;
58	0
59	0	if (NS_IS_HIGH_SURROGATE(ch)) {
60	0	if (i + 1 < aString.Length() && NS_IS_LOW_SURROGATE(aString[i+1])) {
61	0	ch = SURROGATE_TO_UCS4(ch, aString[i+1]);
62	0	chLen = 2;
63	0	} else {
64	0	NS_WARNING("unpaired surrogate found during hyphenation");
65	0	}
66	0	}
67	0
68	0	nsUGenCategory cat = mozilla::unicode::GetGenCategory(ch);
69	0	if (cat == nsUGenCategory::kLetter \|\| cat == nsUGenCategory::kMark) {
70	0	if (!inWord) {
71	0	inWord = true;
72	0	wordStart = i;
73	0	}
74	0	wordLimit = i + chLen;
75	0	if (i + chLen < aString.Length()) {
76	0	continue;
77	0	}
78	0	}
79	0
80	0	if (inWord) {
81	0	// Convert the word to utf-8 for libhyphen, lowercasing it as we go
82	0	// so that it will match the (lowercased) patterns (bug 1105644).
83	0	nsAutoCString utf8;
84	0	const char16_t* const begin = aString.BeginReading();
85	0	const char16_t *cur = begin + wordStart;
86	0	const char16_t *end = begin + wordLimit;
87	0	while (cur < end) {
88	0	uint32_t ch = *cur++;
89	0
90	0	if (NS_IS_HIGH_SURROGATE(ch)) {
91	0	if (cur < end && NS_IS_LOW_SURROGATE(*cur)) {
92	0	ch = SURROGATE_TO_UCS4(ch, *cur++);
93	0	} else {
94	0	ch = 0xfffd; // unpaired surrogate, treat as REPLACEMENT CHAR
95	0	}
96	0	} else if (NS_IS_LOW_SURROGATE(ch)) {
97	0	ch = 0xfffd; // unpaired surrogate
98	0	}
99	0
100	0	// XXX What about language-specific casing? Consider Turkish I/i...
101	0	// In practice, it looks like the current patterns will not be
102	0	// affected by this, as they treat dotted and undotted i similarly.
103	0	ch = ToLowerCase(ch);
104	0
105	0	if (ch < 0x80) { // U+0000 - U+007F
106	0	utf8.Append(ch);
107	0	} else if (ch < 0x0800) { // U+0100 - U+07FF
108	0	utf8.Append(0xC0 \| (ch >> 6));
109	0	utf8.Append(0x80 \| (0x003F & ch));
110	0	} else if (ch < 0x10000) { // U+0800 - U+D7FF,U+E000 - U+FFFF
111	0	utf8.Append(0xE0 \| (ch >> 12));
112	0	utf8.Append(0x80 \| (0x003F & (ch >> 6)));
113	0	utf8.Append(0x80 \| (0x003F & ch));
114	0	} else {
115	0	utf8.Append(0xF0 \| (ch >> 18));
116	0	utf8.Append(0x80 \| (0x003F & (ch >> 12)));
117	0	utf8.Append(0x80 \| (0x003F & (ch >> 6)));
118	0	utf8.Append(0x80 \| (0x003F & ch));
119	0	}
120	0	}
121	0
122	0	AutoTArray<char,200> utf8hyphens;
123	0	utf8hyphens.SetLength(utf8.Length() + 5);
124	0	char **rep = nullptr;
125	0	int *pos = nullptr;
126	0	int *cut = nullptr;
127	0	int err = hnj_hyphen_hyphenate2((HyphenDict*)mDict,
128	0	utf8.BeginReading(), utf8.Length(),
129	0	utf8hyphens.Elements(), nullptr,
130	0	&rep, &pos, &cut);
131	0	if (!err) {
132	0	// Surprisingly, hnj_hyphen_hyphenate2 converts the 'hyphens' buffer
133	0	// from utf8 code unit indexing (which would match the utf8 input
134	0	// string directly) to Unicode character indexing.
135	0	// We then need to convert this to utf16 code unit offsets for Gecko.
136	0	const char *hyphPtr = utf8hyphens.Elements();
137	0	const char16_t *cur = begin + wordStart;
138	0	const char16_t *end = begin + wordLimit;
139	0	while (cur < end) {
140	0	if (*hyphPtr & 0x01) {
141	0	aHyphens[cur - begin] = true;
142	0	}
143	0	cur++;
144	0	if (cur < end && NS_IS_LOW_SURROGATE(*cur) &&
145	0	NS_IS_HIGH_SURROGATE(*(cur-1)))
146	0	{
147	0	cur++;
148	0	}
149	0	hyphPtr++;
150	0	}
151	0	}
152	0	}
153	0
154	0	inWord = false;
155	0	}
156	0
157	0	return NS_OK;
158	0	}

Coverage Report

Created: 2018-09-25 14:53