170 lines
6.5 KiB
JavaScript
170 lines
6.5 KiB
JavaScript
"use strict";
|
||
Object.defineProperty(exports, "__esModule", { value: true });
|
||
const boundaries_1 = require("./boundaries");
|
||
// BreakTypes
// Numeric result codes returned by GraphemerHelper.shouldBreak.
// @type {BreakType}

// A "×" rule matched: no boundary between the last two classes.
const NotBreak = 0;
// A "÷" rule matched (GB4, GB5 or the GB999 fallback).
const BreakStart = 1;
// Returned when a regional indicator occurs among the middle classes.
const Break = 2;
// Regional-indicator lookahead ended with an odd number of indicators seen.
const BreakLastRegional = 3;
// Regional-indicator lookahead ended with an even number of indicators seen.
const BreakPenultimateRegional = 4;
|
||
class GraphemerHelper {
    /**
     * Check whether a UTF-16 surrogate pair starts at position {pos} of the string,
     * i.e. the code unit at {pos} is a high surrogate (0xD800-0xDBFF) and the
     * code unit at {pos + 1} is a low surrogate (0xDC00-0xDFFF).
     * @param str {string}
     * @param pos {number}
     * @returns {boolean}
     */
    static isSurrogate(str, pos) {
        // charCodeAt yields NaN past the end of the string, which fails every
        // range comparison below, so out-of-range positions return false.
        const lead = str.charCodeAt(pos);
        const trail = str.charCodeAt(pos + 1);
        return (0xd800 <= lead &&
            lead <= 0xdbff &&
            0xdc00 <= trail &&
            trail <= 0xdfff);
    }
    /**
     * The String.prototype.codePointAt polyfill
     * Private function, gets a Unicode code point from a JavaScript UTF-16 string
     * handling surrogate pairs appropriately.
     *
     * When {idx} points at the low half of a well-formed pair, the code point of
     * the whole pair is returned as well. An unmatched surrogate half is returned
     * as-is, and an index outside the string yields NaN (the charCodeAt result).
     * @param str {string}
     * @param idx {number} index of the UTF-16 code unit to decode, defaults to 0
     * @returns {number}
     */
    static codePointAt(str, idx = 0) {
        const code = str.charCodeAt(idx);
        // High surrogate that is not the last code unit: try to pair it with
        // the code unit that follows.
        if (0xd800 <= code && code <= 0xdbff && idx < str.length - 1) {
            const low = str.charCodeAt(idx + 1);
            if (0xdc00 <= low && low <= 0xdfff) {
                return (code - 0xd800) * 0x400 + (low - 0xdc00) + 0x10000;
            }
            return code;
        }
        // Low surrogate that is not the first code unit: try to pair it with
        // the code unit that precedes it.
        if (0xdc00 <= code && code <= 0xdfff && idx >= 1) {
            const hi = str.charCodeAt(idx - 1);
            if (0xd800 <= hi && hi <= 0xdbff) {
                return (hi - 0xd800) * 0x400 + (code - 0xdc00) + 0x10000;
            }
            return code;
        }
        // just return the char if an unmatched surrogate half or a
        // single-char codepoint
        return code;
    }
    /**
     * Private function, returns whether a break is allowed between the two given grapheme breaking classes
     * Implemented the UAX #29 3.1.1 Grapheme Cluster Boundary Rules on extended grapheme clusters
     *
     * The decision is taken between the second-to-last class of the sequence
     * [start, ...mid, end] and {end}. The *Emoji arguments form a parallel
     * sequence whose entries are compared against EXTENDED_PICTOGRAPHIC.
     * @param start {number}
     * @param mid {Array<number>}
     * @param end {number}
     * @param startEmoji {number}
     * @param midEmoji {Array<number>}
     * @param endEmoji {number}
     * @returns {number} one of the BreakType codes (NotBreak, BreakStart, Break,
     * BreakLastRegional, BreakPenultimateRegional)
     */
    static shouldBreak(start, mid, end, startEmoji, midEmoji, endEmoji) {
        const CB = boundaries_1.CLUSTER_BREAK;
        const PICTOGRAPHIC = boundaries_1.EXTENDED_PICTOGRAPHIC;
        const isRegional = (c) => c === CB.REGIONAL_INDICATOR;
        const all = [start].concat(mid).concat([end]);
        const allEmoji = [startEmoji].concat(midEmoji).concat([endEmoji]);
        const previous = all[all.length - 2];
        const next = end;
        const nextEmoji = endEmoji;
        // Lookahead terminator for:
        // GB12. ^ (RI RI)* RI ? RI
        // GB13. [^RI] (RI RI)* RI ? RI
        // Applies when everything between the first class and the last regional
        // indicator is itself a regional indicator, and the class right before
        // `end` is neither Prepend nor a regional indicator.
        const rIIndex = all.lastIndexOf(CB.REGIONAL_INDICATOR);
        if (rIIndex > 0 &&
            all.slice(1, rIIndex).every(isRegional) &&
            [CB.PREPEND, CB.REGIONAL_INDICATOR].indexOf(previous) === -1) {
            const regionalCount = all.filter(isRegional).length;
            return regionalCount % 2 === 1
                ? BreakLastRegional
                : BreakPenultimateRegional;
        }
        // GB3. CR × LF
        if (previous === CB.CR && next === CB.LF) {
            return NotBreak;
        }
        // GB4. (Control|CR|LF) ÷
        if (previous === CB.CONTROL ||
            previous === CB.CR ||
            previous === CB.LF) {
            return BreakStart;
        }
        // GB5. ÷ (Control|CR|LF)
        if (next === CB.CONTROL ||
            next === CB.CR ||
            next === CB.LF) {
            return BreakStart;
        }
        // GB6. L × (L|V|LV|LVT)
        if (previous === CB.L &&
            (next === CB.L ||
                next === CB.V ||
                next === CB.LV ||
                next === CB.LVT)) {
            return NotBreak;
        }
        // GB7. (LV|V) × (V|T)
        if ((previous === CB.LV || previous === CB.V) &&
            (next === CB.V || next === CB.T)) {
            return NotBreak;
        }
        // GB8. (LVT|T) × (T)
        if ((previous === CB.LVT || previous === CB.T) &&
            next === CB.T) {
            return NotBreak;
        }
        // GB9. × (Extend|ZWJ)
        if (next === CB.EXTEND || next === CB.ZWJ) {
            return NotBreak;
        }
        // GB9a. × SpacingMark
        if (next === CB.SPACINGMARK) {
            return NotBreak;
        }
        // GB9b. Prepend ×
        if (previous === CB.PREPEND) {
            return NotBreak;
        }
        // GB11. \p{Extended_Pictographic} Extend* ZWJ × \p{Extended_Pictographic}
        // The cheap scalar checks run first; the array scans are only needed
        // when the ZWJ and the following pictographic are actually present.
        if (previous === CB.ZWJ && nextEmoji === PICTOGRAPHIC) {
            // Last pictographic before `end`; only Extend may sit between it
            // and the ZWJ that immediately precedes `end`.
            const pictographicIndex = allEmoji
                .slice(0, -1)
                .lastIndexOf(PICTOGRAPHIC);
            if (pictographicIndex !== -1 &&
                all.slice(pictographicIndex + 1, -2).every((c) => c === CB.EXTEND)) {
                return NotBreak;
            }
        }
        // GB12. ^ (RI RI)* RI × RI
        // GB13. [^RI] (RI RI)* RI × RI
        if (mid.indexOf(CB.REGIONAL_INDICATOR) !== -1) {
            return Break;
        }
        if (previous === CB.REGIONAL_INDICATOR &&
            next === CB.REGIONAL_INDICATOR) {
            return NotBreak;
        }
        // GB999. Any ? Any
        return BreakStart;
    }
}
|
||
// Expose the GraphemerHelper class as this CommonJS module's `default` export.
exports.default = GraphemerHelper;
|