470 lines
11 KiB
JavaScript
470 lines
11 KiB
JavaScript
'use strict'
|
|
|
|
var createParser = require('./parser')
|
|
var expressions = require('./expressions')
|
|
|
|
module.exports = ParseLatin
|
|
|
|
// PARSE LATIN
|
|
|
|
// Transform Latin-script natural language into an NLCST-tree.
|
|
function ParseLatin(doc, file) {
|
|
var value = file || doc
|
|
|
|
if (!(this instanceof ParseLatin)) {
|
|
return new ParseLatin(doc, file)
|
|
}
|
|
|
|
this.doc = value ? String(value) : null
|
|
}
|
|
|
|
// Quick access to the prototype.
|
|
var proto = ParseLatin.prototype
|
|
|
|
// Default position.
|
|
proto.position = true
|
|
|
|
// Create text nodes.
|
|
proto.tokenizeSymbol = createTextFactory('Symbol')
|
|
proto.tokenizeWhiteSpace = createTextFactory('WhiteSpace')
|
|
proto.tokenizePunctuation = createTextFactory('Punctuation')
|
|
proto.tokenizeSource = createTextFactory('Source')
|
|
proto.tokenizeText = createTextFactory('Text')
|
|
|
|
// Expose `run`.
|
|
proto.run = run
|
|
|
|
// Inject `plugins` to modifiy the result of the method at `key` on the operated
|
|
// on context.
|
|
proto.use = useFactory(function(context, key, plugins) {
|
|
context[key] = context[key].concat(plugins)
|
|
})
|
|
|
|
// Inject `plugins` to modifiy the result of the method at `key` on the operated
|
|
// on context, before any other.
|
|
proto.useFirst = useFactory(function(context, key, plugins) {
|
|
context[key] = plugins.concat(context[key])
|
|
})
|
|
|
|
// Easy access to the document parser. This additionally supports retext-style
|
|
// invocation: where an instance is created for each file, and the file is given
|
|
// on construction.
|
|
proto.parse = function(value) {
|
|
return this.tokenizeRoot(value || this.doc)
|
|
}
|
|
|
|
// Transform a `value` into a list of `NLCSTNode`s.
|
|
proto.tokenize = function(value) {
|
|
return tokenize(this, value)
|
|
}
|
|
|
|
// PARENT NODES
|
|
//
|
|
// All these nodes are `pluggable`: they come with a `use` method which accepts
|
|
// a plugin (`function(NLCSTNode)`).
|
|
// Every time one of these methods are called, the plugin is invoked with the
|
|
// node, allowing for easy modification.
|
|
//
|
|
// In fact, the internal transformation from `tokenize` (a list of words, white
|
|
// space, punctuation, and symbols) to `tokenizeRoot` (an NLCST tree), is also
|
|
// implemented through this mechanism.
|
|
|
|
// Create a `WordNode` with its children set to a single `TextNode`, its value
|
|
// set to the given `value`.
|
|
pluggable(ParseLatin, 'tokenizeWord', function(value, eat) {
|
|
var add = (eat || noopEat)('')
|
|
var parent = {type: 'WordNode', children: []}
|
|
|
|
this.tokenizeText(value, eat, parent)
|
|
|
|
return add(parent)
|
|
})
|
|
|
|
// Create a `SentenceNode` with its children set to `Node`s, their values set
|
|
// to the tokenized given `value`.
|
|
//
|
|
// Unless plugins add new nodes, the sentence is populated by `WordNode`s,
|
|
// `SymbolNode`s, `PunctuationNode`s, and `WhiteSpaceNode`s.
|
|
pluggable(
|
|
ParseLatin,
|
|
'tokenizeSentence',
|
|
createParser({
|
|
type: 'SentenceNode',
|
|
tokenizer: 'tokenize'
|
|
})
|
|
)
|
|
|
|
// Create a `ParagraphNode` with its children set to `Node`s, their values set
|
|
// to the tokenized given `value`.
|
|
//
|
|
// Unless plugins add new nodes, the paragraph is populated by `SentenceNode`s
|
|
// and `WhiteSpaceNode`s.
|
|
pluggable(
|
|
ParseLatin,
|
|
'tokenizeParagraph',
|
|
createParser({
|
|
type: 'ParagraphNode',
|
|
delimiter: expressions.terminalMarker,
|
|
delimiterType: 'PunctuationNode',
|
|
tokenizer: 'tokenizeSentence'
|
|
})
|
|
)
|
|
|
|
// Create a `RootNode` with its children set to `Node`s, their values set to the
|
|
// tokenized given `value`.
|
|
pluggable(
|
|
ParseLatin,
|
|
'tokenizeRoot',
|
|
createParser({
|
|
type: 'RootNode',
|
|
delimiter: expressions.newLine,
|
|
delimiterType: 'WhiteSpaceNode',
|
|
tokenizer: 'tokenizeParagraph'
|
|
})
|
|
)
|
|
|
|
// PLUGINS
|
|
|
|
proto.use('tokenizeSentence', [
|
|
require('./plugin/merge-initial-word-symbol'),
|
|
require('./plugin/merge-final-word-symbol'),
|
|
require('./plugin/merge-inner-word-symbol'),
|
|
require('./plugin/merge-inner-word-slash'),
|
|
require('./plugin/merge-initialisms'),
|
|
require('./plugin/merge-words'),
|
|
require('./plugin/patch-position')
|
|
])
|
|
|
|
proto.use('tokenizeParagraph', [
|
|
require('./plugin/merge-non-word-sentences'),
|
|
require('./plugin/merge-affix-symbol'),
|
|
require('./plugin/merge-initial-lower-case-letter-sentences'),
|
|
require('./plugin/merge-initial-digit-sentences'),
|
|
require('./plugin/merge-prefix-exceptions'),
|
|
require('./plugin/merge-affix-exceptions'),
|
|
require('./plugin/merge-remaining-full-stops'),
|
|
require('./plugin/make-initial-white-space-siblings'),
|
|
require('./plugin/make-final-white-space-siblings'),
|
|
require('./plugin/break-implicit-sentences'),
|
|
require('./plugin/remove-empty-nodes'),
|
|
require('./plugin/patch-position')
|
|
])
|
|
|
|
proto.use('tokenizeRoot', [
|
|
require('./plugin/make-initial-white-space-siblings'),
|
|
require('./plugin/make-final-white-space-siblings'),
|
|
require('./plugin/remove-empty-nodes'),
|
|
require('./plugin/patch-position')
|
|
])
|
|
|
|
// TEXT NODES
|
|
|
|
// Factory to create a `Text`.
|
|
function createTextFactory(type) {
|
|
type += 'Node'
|
|
|
|
return createText
|
|
|
|
// Construct a `Text` from a bound `type`
|
|
function createText(value, eat, parent) {
|
|
if (value === null || value === undefined) {
|
|
value = ''
|
|
}
|
|
|
|
return (eat || noopEat)(value)(
|
|
{
|
|
type: type,
|
|
value: String(value)
|
|
},
|
|
parent
|
|
)
|
|
}
|
|
}
|
|
|
|
// Run transform plug-ins for `key` on `nodes`.
|
|
function run(key, nodes) {
|
|
var wareKey = key + 'Plugins'
|
|
var plugins = this[wareKey]
|
|
var index = -1
|
|
|
|
if (plugins) {
|
|
while (plugins[++index]) {
|
|
plugins[index](nodes)
|
|
}
|
|
}
|
|
|
|
return nodes
|
|
}
|
|
|
|
// Make a method “pluggable”.
|
|
function pluggable(Constructor, key, callback) {
|
|
// Set a pluggable version of `callback` on `Constructor`.
|
|
Constructor.prototype[key] = function() {
|
|
return this.run(key, callback.apply(this, arguments))
|
|
}
|
|
}
|
|
|
|
// Factory to inject `plugins`. Takes `callback` for the actual inserting.
|
|
function useFactory(callback) {
|
|
return use
|
|
|
|
// Validate if `plugins` can be inserted.
|
|
// Invokes the bound `callback` to do the actual inserting.
|
|
function use(key, plugins) {
|
|
var self = this
|
|
var wareKey
|
|
|
|
// Throw if the method is not pluggable.
|
|
if (!(key in self)) {
|
|
throw new Error(
|
|
'Illegal Invocation: Unsupported `key` for ' +
|
|
'`use(key, plugins)`. Make sure `key` is a ' +
|
|
'supported function'
|
|
)
|
|
}
|
|
|
|
// Fail silently when no plugins are given.
|
|
if (!plugins) {
|
|
return
|
|
}
|
|
|
|
wareKey = key + 'Plugins'
|
|
|
|
// Make sure `plugins` is a list.
|
|
if (typeof plugins === 'function') {
|
|
plugins = [plugins]
|
|
} else {
|
|
plugins = plugins.concat()
|
|
}
|
|
|
|
// Make sure `wareKey` exists.
|
|
if (!self[wareKey]) {
|
|
self[wareKey] = []
|
|
}
|
|
|
|
// Invoke callback with the ware key and plugins.
|
|
callback(self, wareKey, plugins)
|
|
}
|
|
}
|
|
|
|
// CLASSIFY
|
|
|
|
// Match a word character.
|
|
var wordRe = expressions.word
|
|
|
|
// Match a surrogate character.
|
|
var surrogatesRe = expressions.surrogates
|
|
|
|
// Match a punctuation character.
|
|
var punctuationRe = expressions.punctuation
|
|
|
|
// Match a white space character.
|
|
var whiteSpaceRe = expressions.whiteSpace
|
|
|
|
// Transform a `value` into a list of `NLCSTNode`s.
|
|
function tokenize(parser, value) {
|
|
var tokens
|
|
var offset
|
|
var line
|
|
var column
|
|
var index
|
|
var length
|
|
var character
|
|
var queue
|
|
var prev
|
|
var left
|
|
var right
|
|
var eater
|
|
|
|
if (value === null || value === undefined) {
|
|
value = ''
|
|
} else if (value instanceof String) {
|
|
value = value.toString()
|
|
}
|
|
|
|
if (typeof value !== 'string') {
|
|
// Return the given nodes if this is either an empty array, or an array with
|
|
// a node as a first child.
|
|
if ('length' in value && (!value[0] || value[0].type)) {
|
|
return value
|
|
}
|
|
|
|
throw new Error(
|
|
"Illegal invocation: '" +
|
|
value +
|
|
"' is not a valid argument for 'ParseLatin'"
|
|
)
|
|
}
|
|
|
|
tokens = []
|
|
|
|
if (!value) {
|
|
return tokens
|
|
}
|
|
|
|
index = 0
|
|
offset = 0
|
|
line = 1
|
|
column = 1
|
|
|
|
// Eat mechanism to use.
|
|
eater = parser.position ? eat : noPositionEat
|
|
|
|
length = value.length
|
|
prev = ''
|
|
queue = ''
|
|
|
|
while (index < length) {
|
|
character = value.charAt(index)
|
|
|
|
if (whiteSpaceRe.test(character)) {
|
|
right = 'WhiteSpace'
|
|
} else if (punctuationRe.test(character)) {
|
|
right = 'Punctuation'
|
|
} else if (wordRe.test(character)) {
|
|
right = 'Word'
|
|
} else {
|
|
right = 'Symbol'
|
|
}
|
|
|
|
tick()
|
|
|
|
prev = character
|
|
character = ''
|
|
left = right
|
|
right = null
|
|
|
|
index++
|
|
}
|
|
|
|
tick()
|
|
|
|
return tokens
|
|
|
|
// Check one character.
|
|
function tick() {
|
|
if (
|
|
left === right &&
|
|
(left === 'Word' ||
|
|
left === 'WhiteSpace' ||
|
|
character === prev ||
|
|
surrogatesRe.test(character))
|
|
) {
|
|
queue += character
|
|
} else {
|
|
// Flush the previous queue.
|
|
if (queue) {
|
|
parser['tokenize' + left](queue, eater)
|
|
}
|
|
|
|
queue = character
|
|
}
|
|
}
|
|
|
|
// Remove `subvalue` from `value`.
|
|
// Expects `subvalue` to be at the start from `value`, and applies no
|
|
// validation.
|
|
function eat(subvalue) {
|
|
var pos = position()
|
|
|
|
update(subvalue)
|
|
|
|
return apply
|
|
|
|
// Add the given arguments, add `position` to the returned node, and return
|
|
// the node.
|
|
function apply() {
|
|
return pos(add.apply(null, arguments))
|
|
}
|
|
}
|
|
|
|
// Remove `subvalue` from `value`.
|
|
// Does not patch positional information.
|
|
function noPositionEat() {
|
|
return apply
|
|
|
|
// Add the given arguments and return the node.
|
|
function apply() {
|
|
return add.apply(null, arguments)
|
|
}
|
|
}
|
|
|
|
// Add mechanism.
|
|
function add(node, parent) {
|
|
if (parent) {
|
|
parent.children.push(node)
|
|
} else {
|
|
tokens.push(node)
|
|
}
|
|
|
|
return node
|
|
}
|
|
|
|
// Mark position and patch `node.position`.
|
|
function position() {
|
|
var before = now()
|
|
|
|
// Add the position to a node.
|
|
function patch(node) {
|
|
node.position = new Position(before)
|
|
|
|
return node
|
|
}
|
|
|
|
return patch
|
|
}
|
|
|
|
// Update line and column based on `value`.
|
|
function update(subvalue) {
|
|
var subvalueLength = subvalue.length
|
|
var character = -1
|
|
var lastIndex = -1
|
|
|
|
offset += subvalueLength
|
|
|
|
while (++character < subvalueLength) {
|
|
if (subvalue.charAt(character) === '\n') {
|
|
lastIndex = character
|
|
line++
|
|
}
|
|
}
|
|
|
|
if (lastIndex === -1) {
|
|
column += subvalueLength
|
|
} else {
|
|
column = subvalueLength - lastIndex
|
|
}
|
|
}
|
|
|
|
// Store position information for a node.
|
|
function Position(start) {
|
|
this.start = start
|
|
this.end = now()
|
|
}
|
|
|
|
// Get the current position.
|
|
function now() {
|
|
return {
|
|
line: line,
|
|
column: column,
|
|
offset: offset
|
|
}
|
|
}
|
|
}
|
|
|
|
// Add mechanism used when text-tokenisers are called directly outside of the
|
|
// `tokenize` function.
|
|
function noopAdd(node, parent) {
|
|
if (parent) {
|
|
parent.children.push(node)
|
|
}
|
|
|
|
return node
|
|
}
|
|
|
|
// Eat and add mechanism without adding positional information, used when
|
|
// text-tokenisers are called directly outside of the `tokenize` function.
|
|
function noopEat() {
|
|
return noopAdd
|
|
}
|