mirror of
https://github.com/tooot-app/app
synced 2025-03-13 01:50:08 +01:00
https://github.com/gregjacobs/Autolinker.js/pull/240/ https://github.com/gregjacobs/Autolinker.js/issues/130
313 lines
15 KiB
JavaScript
313 lines
15 KiB
JavaScript
import * as tslib_1 from "tslib";
|
|
import { Matcher } from "./matcher";
|
|
import { alphaNumericCharsStr, alphaNumericAndMarksCharsStr, getDomainNameStr } from "../regex-lib";
|
|
import { tldRegex } from "./tld-regex";
|
|
import { UrlMatch } from "../match/url-match";
|
|
import { UrlMatchValidator } from "./url-match-validator";
|
|
// RegExp objects which are shared by all instances of UrlMatcher. These are
|
|
// here to avoid re-instantiating the RegExp objects if `Autolinker.link()` is
|
|
// called multiple times, thus instantiating UrlMatcher and its RegExp
|
|
// objects each time (which is very expensive - see https://github.com/gregjacobs/Autolinker.js/issues/314).
|
|
// See descriptions of the properties where they are used for details about them
|
|
var matcherRegex = (function () {
|
|
var schemeRegex = /(?:[A-Za-z][-.+A-Za-z0-9]{0,63}:(?![A-Za-z][-.+A-Za-z0-9]{0,63}:\/\/)(?!\d+\/?)(?:\/\/)?)/, // match protocol, allow in format "http://" or "mailto:". However, do not match the first part of something like 'link:http://www.google.com' (i.e. don't match "link:"). Also, make sure we don't interpret 'google.com:8000' as if 'google.com' was a protocol here (i.e. ignore a trailing port number in this regex)
|
|
wwwRegex = /(?:www\.)/, // starting with 'www.'
|
|
// Allow optional path, query string, and hash anchor, not ending in the following characters: "?!:,.;"
|
|
// http://blog.codinghorror.com/the-problem-with-urls/
|
|
urlSuffixRegex = new RegExp('[/?#](?:[' + alphaNumericAndMarksCharsStr + '\\-+&@#/%=~_()|\'$*\\[\\]{}?!:,.;^\u2713]*[' + alphaNumericAndMarksCharsStr + '\\-+&@#/%=~_()|\'$*\\[\\]{}\u2713])?');
|
|
return new RegExp([
|
|
'(?:',
|
|
'(',
|
|
schemeRegex.source,
|
|
getDomainNameStr(2),
|
|
')',
|
|
'|',
|
|
'(',
|
|
'(//)?',
|
|
wwwRegex.source,
|
|
getDomainNameStr(6),
|
|
')',
|
|
'|',
|
|
'(',
|
|
'(//)?',
|
|
getDomainNameStr(10) + '\\.',
|
|
tldRegex.source,
|
|
'(?![-' + alphaNumericCharsStr + '])',
|
|
')',
|
|
')',
|
|
'(?::[0-9]+)?',
|
|
'(?:' + urlSuffixRegex.source + ')?' // match for path, query string, and/or hash anchor - optional
|
|
].join(""), 'gi');
|
|
})();
|
|
var wordCharRegExp = new RegExp('[' + alphaNumericAndMarksCharsStr + ']');
|
|
/**
|
|
* @class Autolinker.matcher.Url
|
|
* @extends Autolinker.matcher.Matcher
|
|
*
|
|
* Matcher to find URL matches in an input string.
|
|
*
|
|
* See this class's superclass ({@link Autolinker.matcher.Matcher}) for more details.
|
|
*/
|
|
var UrlMatcher = /** @class */ (function (_super) {
|
|
tslib_1.__extends(UrlMatcher, _super);
|
|
/**
|
|
* @method constructor
|
|
* @param {Object} cfg The configuration properties for the Match instance,
|
|
* specified in an Object (map).
|
|
*/
|
|
function UrlMatcher(cfg) {
|
|
var _this = _super.call(this, cfg) || this;
|
|
/**
|
|
* @cfg {Object} stripPrefix (required)
|
|
*
|
|
* The Object form of {@link Autolinker#cfg-stripPrefix}.
|
|
*/
|
|
_this.stripPrefix = { scheme: true, www: true }; // default value just to get the above doc comment in the ES5 output and documentation generator
|
|
/**
|
|
* @cfg {Boolean} stripTrailingSlash (required)
|
|
* @inheritdoc Autolinker#stripTrailingSlash
|
|
*/
|
|
_this.stripTrailingSlash = true; // default value just to get the above doc comment in the ES5 output and documentation generator
|
|
/**
|
|
* @cfg {Boolean} decodePercentEncoding (required)
|
|
* @inheritdoc Autolinker#decodePercentEncoding
|
|
*/
|
|
_this.decodePercentEncoding = true; // default value just to get the above doc comment in the ES5 output and documentation generator
|
|
/**
|
|
* @protected
|
|
* @property {RegExp} matcherRegex
|
|
*
|
|
* The regular expression to match URLs with an optional scheme, port
|
|
* number, path, query string, and hash anchor.
|
|
*
|
|
* Example matches:
|
|
*
|
|
* http://google.com
|
|
* www.google.com
|
|
* google.com/path/to/file?q1=1&q2=2#myAnchor
|
|
*
|
|
*
|
|
* This regular expression will have the following capturing groups:
|
|
*
|
|
* 1. Group that matches a scheme-prefixed URL (i.e. 'http://google.com').
|
|
* This is used to match scheme URLs with just a single word, such as
|
|
* 'http://localhost', where we won't double check that the domain name
|
|
* has at least one dot ('.') in it.
|
|
* 2. Group that matches a 'www.' prefixed URL. This is only matched if the
|
|
* 'www.' text was not prefixed by a scheme (i.e.: not prefixed by
|
|
* 'http://', 'ftp:', etc.)
|
|
* 3. A protocol-relative ('//') match for the case of a 'www.' prefixed
|
|
* URL. Will be an empty string if it is not a protocol-relative match.
|
|
* We need to know the character before the '//' in order to determine
|
|
* if it is a valid match or the // was in a string we don't want to
|
|
* auto-link.
|
|
* 4. Group that matches a known TLD (top level domain), when a scheme
|
|
* or 'www.'-prefixed domain is not matched.
|
|
* 5. A protocol-relative ('//') match for the case of a known TLD prefixed
|
|
* URL. Will be an empty string if it is not a protocol-relative match.
|
|
* See #3 for more info.
|
|
*/
|
|
_this.matcherRegex = matcherRegex;
|
|
/**
|
|
* A regular expression to use to check the character before a protocol-relative
|
|
* URL match. We don't want to match a protocol-relative URL if it is part
|
|
* of another word.
|
|
*
|
|
* For example, we want to match something like "Go to: //google.com",
|
|
* but we don't want to match something like "abc//google.com"
|
|
*
|
|
* This regular expression is used to test the character before the '//'.
|
|
*
|
|
* @protected
|
|
* @type {RegExp} wordCharRegExp
|
|
*/
|
|
_this.wordCharRegExp = wordCharRegExp;
|
|
_this.stripPrefix = cfg.stripPrefix;
|
|
_this.stripTrailingSlash = cfg.stripTrailingSlash;
|
|
_this.decodePercentEncoding = cfg.decodePercentEncoding;
|
|
return _this;
|
|
}
|
|
/**
|
|
* @inheritdoc
|
|
*/
|
|
UrlMatcher.prototype.parseMatches = function (text) {
|
|
var matcherRegex = this.matcherRegex, stripPrefix = this.stripPrefix, stripTrailingSlash = this.stripTrailingSlash, decodePercentEncoding = this.decodePercentEncoding, tagBuilder = this.tagBuilder, matches = [], match;
|
|
var _loop_1 = function () {
|
|
var matchStr = match[0], schemeUrlMatch = match[1], wwwUrlMatch = match[4], wwwProtocolRelativeMatch = match[5],
|
|
//tldUrlMatch = match[ 8 ], -- not needed at the moment
|
|
tldProtocolRelativeMatch = match[9], offset = match.index, protocolRelativeMatch = wwwProtocolRelativeMatch || tldProtocolRelativeMatch, prevChar = text.charAt(offset - 1);
|
|
if (!UrlMatchValidator.isValid(matchStr, schemeUrlMatch)) {
|
|
return "continue";
|
|
}
|
|
// If the match is preceded by an '@' character, then it is either
|
|
// an email address or a username. Skip these types of matches.
|
|
if (offset > 0 && prevChar === '@') {
|
|
return "continue";
|
|
}
|
|
// If it's a protocol-relative '//' match, but the character before the '//'
|
|
// was a word character (i.e. a letter/number), then we found the '//' in the
|
|
// middle of another word (such as "asdf//asdf.com"). In this case, skip the
|
|
// match.
|
|
if (offset > 0 && protocolRelativeMatch && this_1.wordCharRegExp.test(prevChar)) {
|
|
return "continue";
|
|
}
|
|
// If the URL ends with a question mark, don't include the question
|
|
// mark as part of the URL. We'll assume the question mark was the
|
|
// end of a sentence, such as: "Going to google.com?"
|
|
if (/\?$/.test(matchStr)) {
|
|
matchStr = matchStr.substr(0, matchStr.length - 1);
|
|
}
|
|
// Handle a closing parenthesis or square bracket at the end of the
|
|
// match, and exclude it if there is not a matching open parenthesis
|
|
// or square bracket in the match itself.
|
|
if (this_1.matchHasUnbalancedClosingParen(matchStr)) {
|
|
matchStr = matchStr.substr(0, matchStr.length - 1); // remove the trailing ")"
|
|
}
|
|
else {
|
|
// Handle an invalid character after the TLD
|
|
var pos = this_1.matchHasInvalidCharAfterTld(matchStr, schemeUrlMatch);
|
|
if (pos > -1) {
|
|
matchStr = matchStr.substr(0, pos); // remove the trailing invalid chars
|
|
}
|
|
}
|
|
// The autolinker accepts many characters in a url's scheme (like `fake://test.com`).
|
|
// However, in cases where a URL is missing whitespace before an obvious link,
|
|
// (for example: `nowhitespacehttp://www.test.com`), we only want the match to start
|
|
// at the http:// part. We will check if the match contains a common scheme and then
|
|
// shift the match to start from there.
|
|
var foundCommonScheme = ['http://', 'https://'].find(function (commonScheme) { return !!schemeUrlMatch && schemeUrlMatch.indexOf(commonScheme) !== -1; });
|
|
if (foundCommonScheme) {
|
|
// If we found an overmatched URL, we want to find the index
|
|
// of where the match should start and shift the match to
|
|
// start from the beginning of the common scheme
|
|
var indexOfSchemeStart = matchStr.indexOf(foundCommonScheme);
|
|
matchStr = matchStr.substr(indexOfSchemeStart);
|
|
schemeUrlMatch = schemeUrlMatch.substr(indexOfSchemeStart);
|
|
offset = offset + indexOfSchemeStart;
|
|
}
|
|
var urlMatchType = schemeUrlMatch ? 'scheme' : (wwwUrlMatch ? 'www' : 'tld'), protocolUrlMatch = !!schemeUrlMatch;
|
|
matches.push(new UrlMatch({
|
|
tagBuilder: tagBuilder,
|
|
matchedText: matchStr,
|
|
offset: offset,
|
|
urlMatchType: urlMatchType,
|
|
url: matchStr,
|
|
protocolUrlMatch: protocolUrlMatch,
|
|
protocolRelativeMatch: !!protocolRelativeMatch,
|
|
stripPrefix: stripPrefix,
|
|
stripTrailingSlash: stripTrailingSlash,
|
|
decodePercentEncoding: decodePercentEncoding,
|
|
}));
|
|
};
|
|
var this_1 = this;
|
|
while ((match = matcherRegex.exec(text)) !== null) {
|
|
_loop_1();
|
|
}
|
|
return matches;
|
|
};
|
|
/**
|
|
* Determines if a match found has an unmatched closing parenthesis,
|
|
* square bracket or curly bracket. If so, the symbol will be removed
|
|
* from the match itself, and appended after the generated anchor tag.
|
|
*
|
|
* A match may have an extra closing parenthesis at the end of the match
|
|
* because the regular expression must include parenthesis for URLs such as
|
|
* "wikipedia.com/something_(disambiguation)", which should be auto-linked.
|
|
*
|
|
* However, an extra parenthesis *will* be included when the URL itself is
|
|
* wrapped in parenthesis, such as in the case of:
|
|
* "(wikipedia.com/something_(disambiguation))"
|
|
* In this case, the last closing parenthesis should *not* be part of the
|
|
* URL itself, and this method will return `true`.
|
|
*
|
|
* For square brackets in URLs such as in PHP arrays, the same behavior as
|
|
* parenthesis discussed above should happen:
|
|
* "[http://www.example.com/foo.php?bar[]=1&bar[]=2&bar[]=3]"
|
|
* The closing square bracket should not be part of the URL itself, and this
|
|
* method will return `true`.
|
|
*
|
|
* @protected
|
|
* @param {String} matchStr The full match string from the {@link #matcherRegex}.
|
|
* @return {Boolean} `true` if there is an unbalanced closing parenthesis or
|
|
* square bracket at the end of the `matchStr`, `false` otherwise.
|
|
*/
|
|
UrlMatcher.prototype.matchHasUnbalancedClosingParen = function (matchStr) {
|
|
var endChar = matchStr.charAt(matchStr.length - 1);
|
|
var startChar;
|
|
if (endChar === ')') {
|
|
startChar = '(';
|
|
}
|
|
else if (endChar === ']') {
|
|
startChar = '[';
|
|
}
|
|
else if (endChar === '}') {
|
|
startChar = '{';
|
|
}
|
|
else {
|
|
return false; // not a close parenthesis or square bracket
|
|
}
|
|
// Find if there are the same number of open braces as close braces in
|
|
// the URL string, minus the last character (which we have already
|
|
// determined to be either ')', ']' or '}'
|
|
var numOpenBraces = 0;
|
|
for (var i = 0, len = matchStr.length - 1; i < len; i++) {
|
|
var char = matchStr.charAt(i);
|
|
if (char === startChar) {
|
|
numOpenBraces++;
|
|
}
|
|
else if (char === endChar) {
|
|
numOpenBraces = Math.max(numOpenBraces - 1, 0);
|
|
}
|
|
}
|
|
// If the number of open braces matches the number of close braces in
|
|
// the URL minus the last character, then the match has *unbalanced*
|
|
// braces because of the last character. Example of unbalanced braces
|
|
// from the regex match:
|
|
// "http://example.com?a[]=1]"
|
|
if (numOpenBraces === 0) {
|
|
return true;
|
|
}
|
|
return false;
|
|
};
|
|
/**
|
|
* Determine if there's an invalid character after the TLD in a URL. Valid
|
|
* characters after TLD are ':/?#'. Exclude scheme matched URLs from this
|
|
* check.
|
|
*
|
|
* @protected
|
|
* @param {String} urlMatch The matched URL, if there was one. Will be an
|
|
* empty string if the match is not a URL match.
|
|
* @param {String} schemeUrlMatch The match URL string for a scheme
|
|
* match. Ex: 'http://yahoo.com'. This is used to match something like
|
|
* 'http://localhost', where we won't double check that the domain name
|
|
* has at least one '.' in it.
|
|
* @return {Number} the position where the invalid character was found. If
|
|
* no such character was found, returns -1
|
|
*/
|
|
UrlMatcher.prototype.matchHasInvalidCharAfterTld = function (urlMatch, schemeUrlMatch) {
|
|
if (!urlMatch) {
|
|
return -1;
|
|
}
|
|
var offset = 0;
|
|
if (schemeUrlMatch) {
|
|
offset = urlMatch.indexOf(':');
|
|
urlMatch = urlMatch.slice(offset);
|
|
}
|
|
var re = new RegExp("^((.?\/\/)?[-." + alphaNumericAndMarksCharsStr + "]*[-" + alphaNumericAndMarksCharsStr + "]\\.[-" + alphaNumericAndMarksCharsStr + "]+)");
|
|
var res = re.exec(urlMatch);
|
|
if (res === null) {
|
|
return -1;
|
|
}
|
|
offset += res[1].length;
|
|
urlMatch = urlMatch.slice(res[1].length);
|
|
if (/^[^-.A-Za-z0-9:\/?#]/.test(urlMatch)) {
|
|
return offset;
|
|
}
|
|
return -1;
|
|
};
|
|
return UrlMatcher;
|
|
}(Matcher));
|
|
export { UrlMatcher };
|
|
|
|
//# sourceMappingURL=url-matcher.js.map
|