backend v4 half
This commit is contained in:
Generated
Vendored
+86
@@ -0,0 +1,86 @@
|
||||
// Copy-pasted from `PhoneNumberMatcher.js`.
|
||||
|
||||
import { PLUS_CHARS } from '../constants.js'
|
||||
import { limit } from './util.js'
|
||||
|
||||
import {
|
||||
isLatinLetter,
|
||||
isInvalidPunctuationSymbol
|
||||
} from './utf-8.js'
|
||||
|
||||
// Regex character-class fragments (no surrounding `[` `]`) listing the
// opening brackets: ASCII `(` and `[` plus their fullwidth forms.
// The `\\` keeps `[` escaped once the fragment is placed inside a class.
const OPENING_PARENS = '(\\[\uFF08\uFF3B'

// The corresponding closing brackets: ASCII `)` and `]` plus fullwidth forms.
const CLOSING_PARENS = ')\\]\uFF09\uFF3D'

// A complete negated character class: any single character that is
// neither an opening nor a closing bracket.
const NON_PARENS = '[^' + OPENING_PARENS + CLOSING_PARENS + ']'
|
||||
|
||||
// Character class for punctuation that may appear at the start of a
// phone number: any opening bracket or any of the `PLUS_CHARS`.
export const LEAD_CLASS = '[' + OPENING_PARENS + PLUS_CHARS + ']'

// `LEAD_CLASS` anchored to the start of the string: tests whether a
// candidate begins with such punctuation.
const LEAD_CLASS_LEADING = new RegExp(`^${LEAD_CLASS}`)
|
||||
|
||||
// Quantifier applied to the "complete bracket pair" group below.
// NOTE(review): `limit(0, 3)` presumably expands to `{0,3}` — confirm in `./util.js`.
const BRACKET_PAIR_LIMIT = limit(0, 3)

/**
 * Whole-string pattern verifying that brackets in a candidate are balanced
 * and non-empty. A candidate without any brackets also matches.
 *
 * The pieces, in order:
 *  - an optional opening bracket at the very start, which need not be closed;
 *  - an optional run of non-bracket characters ending in a closing bracket,
 *    covering the case where the leading opening bracket was dropped;
 *  - at least one non-bracket character;
 *  - a limited number (`BRACKET_PAIR_LIMIT`) of complete pairs, each one
 *    enclosing at least one non-bracket character;
 *  - any trailing non-bracket characters.
 */
const MATCHING_BRACKETS_ENTIRE = new RegExp([
	'^',
	`(?:[${OPENING_PARENS}])?`,
	`(?:${NON_PARENS}+[${CLOSING_PARENS}])?`,
	`${NON_PARENS}+`,
	`(?:[${OPENING_PARENS}]${NON_PARENS}+[${CLOSING_PARENS}])${BRACKET_PAIR_LIMIT}`,
	`${NON_PARENS}*`,
	'$'
].join(''))
|
||||
|
||||
/**
 * Detects text that looks like a publication page range followed by a
 * parenthesised number, so that it is not mistaken for a phone number.
 *
 * Shape matched: 1-5 digits, one or more hyphens, 1-5 digits, up to four
 * whitespace characters, an opening parenthesis, then at least one digit.
 *
 * Example citation:
 * <pre>Computing Complete Answers to Queries in the Presence of Limited Access Patterns.
 * Chen Li. VLDB J. 12(3): 211-227 (2003).</pre>
 *
 * Here "211-227 (2003)" is a page range and a year, not a telephone number.
 */
const PUB_PAGES = /\d{1,5}-+\d{1,5}\s{0,4}\(\d{1,4}/
|
||||
|
||||
/**
 * Checks whether a substring of `text` is acceptable as a phone number
 * candidate, based on its bracket structure and (unless leniency is
 * `'POSSIBLE'`) on the characters immediately surrounding it.
 *
 * @param {string} candidate - The substring being considered.
 * @param {number} offset - Index in `text` at which `candidate` starts.
 * @param {string} text - The full text in which the candidate was found.
 * @param {string} leniency - Leniency level. The surrounding-character
 *   checks are skipped only when this is exactly `'POSSIBLE'`.
 * @return {boolean} `true` if the candidate passes all checks, `false` otherwise.
 */
export default function isValidCandidate(candidate, offset, text, leniency) {
	// Reject candidates whose formatting indicates they are not phone numbers:
	// unbalanced/empty brackets, or a publication page range like "211-227 (2003)".
	// Always return a boolean here, consistent with the other rejection paths.
	if (!MATCHING_BRACKETS_ENTIRE.test(candidate) || PUB_PAGES.test(candidate)) {
		return false
	}

	// With leniency 'POSSIBLE' no surrounding-character checks are performed.
	if (leniency === 'POSSIBLE') {
		return true
	}

	// For VALID or stricter leniency, skip numbers glued to Latin letters,
	// e.g. "abc8005001234" or "8005001234def".

	// If the candidate is not at the start of the text and does not itself
	// start with phone-number punctuation, inspect the preceding character.
	if (offset > 0 && !LEAD_CLASS_LEADING.test(candidate)) {
		const previousChar = text[offset - 1]
		// Reject if it is a Latin letter or an invalid punctuation symbol.
		if (isInvalidPunctuationSymbol(previousChar) || isLatinLetter(previousChar)) {
			return false
		}
	}

	// Index of the first character after the candidate.
	const lastCharIndex = offset + candidate.length
	if (lastCharIndex < text.length) {
		const nextChar = text[lastCharIndex]
		// Same rule for the character that follows the candidate.
		if (isInvalidPunctuationSymbol(nextChar) || isLatinLetter(nextChar)) {
			return false
		}
	}

	return true
}
|
||||
Reference in New Issue
Block a user