// RegExpCreator class

/**
 * Creates regular expressions based on specified settings
 * @example
 * new RegExpCreator({caseSensitive: true, diacritics: false}).create('lorem');
 * // => /()(lorem)/g
 */
class RegExpCreator {

  /**
   * @typedef RegExpCreator~accuracyObj
   * @type {object.<string>}
   * @property {string} value - An accuracy string value
   * @property {string[]} limiters - A custom array of limiters. For example
   * <code>["-", ","]</code>
   */
  /**
   * @typedef RegExpCreator~accuracy
   * @type {string}
   * @property {"partially"|"complementary"|"exactly"|"startsWith"|RegExpCreator~accuracyObj}
   * [accuracy="partially"] - Either one of the following string values:
   * <ul>
   *   <li><i>partially</i>: When searching for "lor" only "lor" inside
   *   "lorem" will be marked</li>
   *   <li><i>complementary</i>: When searching for "lor" the whole word
   *   "lorem" will be marked</li>
   *   <li><i>exactly</i>: When searching for "lor" only those exact words
   *   will be marked. In this example nothing inside "lorem".</li>
   *   <li><i>startsWith</i>: When searching for "lor" the whole word
   *   "lorem" will be marked, but nothing inside "dolor" because the term
   *   must follow the start of the text, a white space or a limiter</li>
   * </ul>
   * Or an object containing two properties:
   * <ul>
   *   <li><i>value</i>: The value must be "exactly" or "complementary" or "startsWith"</li>
   *   <li><i>limiters</i>: A custom array of string limiters</li>
   * </ul>
   */
  /**
   * @typedef RegExpCreator~wildcards
   * @type {string}
   * @property {"disabled"|"enabled"|"withSpaces"}
   * [wildcards="disabled"] - Set to any of the following string values:
   * <ul>
   *   <li><i>disabled</i>: Disable wildcard usage</li>
   *   <li><i>enabled</i>: When searching for "lor?m", the "?" will match zero
   *   or one non-space character (e.g. "lorm", "loram", "lor3m", etc). When
   *   searching for "lor*m", the "*" will match zero or more non-space
   *   characters (e.g. "lorm", "loram", "lor123m", etc).</li>
   *   <li><i>withSpaces</i>: When searching for "lor?m", the "?" will
   *   match zero or one space or non-space character (e.g. "lor m", "loram",
   *   etc). When searching for "lor*m", the "*" will match zero or more space
   *   or non-space characters (e.g. "lorm", "lore et dolor ipsum", "lor: m",
   *   etc).</li>
   * </ul>
   */
  /**
   * @typedef RegExpCreator~ignorePunctuation
   * @type {string[]}
   * @property {string} The strings in this setting will contain punctuation
   * marks that will be ignored:
   * <ul>
   *   <li>These punctuation marks can be between any characters, e.g. setting
   *   this option to <code>["'"]</code> would match "Worlds", "World's" and
   *   "Wo'rlds"</li>
   *   <li>One or more apostrophes between the letters would still produce a
   *   match (e.g. "W'o''r'l'd's").</li>
   *   <li>A typical setting for this option could be as follows:
   *   <pre>ignorePunctuation: ":;.,-–—‒_(){}[]!'\"+=".split(""),</pre> This
   *   setting includes common punctuation as well as a minus, en-dash,
   *   em-dash and figure-dash
   *   ({@link https://en.wikipedia.org/wiki/Dash#Figure_dash ref}), as well
   *   as an underscore.</li>
   * </ul>
   */

  /**
   * @typedef RegExpCreator~options
   * @type {object.<string>}
   * @property {boolean} [diacritics=true] - If diacritic characters should be
   * matched. ({@link https://en.wikipedia.org/wiki/Diacritic Diacritics})
   * @property {object.<string|string[]>} [synonyms] - An object with synonyms.
   * The key will be a synonym for the value and the value for the key
   * @property {RegExpCreator~accuracy} [accuracy]
   * @property {boolean} [caseSensitive=false] - Whether to search case sensitive
   * @property {boolean} [ignoreJoiners=false] - Whether to ignore word
   * joiners inside of key words. These include soft-hyphens, zero-width
   * space, zero-width non-joiners and zero-width joiners.
   * @property {RegExpCreator~ignorePunctuation} [ignorePunctuation]
   * @property {RegExpCreator~wildcards} [wildcards]
   */
  /**
   * @typedef RegExpCreator~patternObj
   * @type {object}
   * @property {string} lookbehind - A lookbehind capturing group
   * @property {string} pattern - A string pattern
   * @property {string} lookahead - A positive lookahead assertion
   */
  /**
   * Merges the given options over the defaults. The merge is shallow, so
   * nested values (e.g. the synonyms object) are shared with the caller.
   * @param {RegExpCreator~options} [options] - Optional options object
   */
  constructor(options) {
    this.opt = Object.assign({}, {
      'diacritics': true,
      'synonyms': {},
      'accuracy': 'partially',
      'caseSensitive': false,
      'ignoreJoiners': false,
      'ignorePunctuation': [],
      'wildcards': 'disabled'
    }, options);
  }

  /**
   * The array with lower and upper cases diacritics characters. Entries are
   * stored in pairs: an even index holds a lower case group, the following
   * odd index holds the same group in upper case. Built lazily on first access.
   * @type {string[]}
   * @access protected
   */
  get chars() {
    if ( !this._chars) {
      this._chars = [];
      // initialises an array with lower and upper cases diacritics characters
      ['aàáảãạăằắẳẵặâầấẩẫậäåāą', 'cçćč', 'dđď', 'eèéẻẽẹêềếểễệëěēę',
        'iìíỉĩịîïī',  'lł', 'nñňń', 'oòóỏõọôồốổỗộơởỡớờợöøōő',  'rř',
        'sšśșş', 'tťțţ', 'uùúủũụưừứửữựûüůūű', 'yýỳỷỹỵÿ', 'zžżź'].forEach(str => {
        this._chars.push(str, str.toUpperCase());
      });
    }
    return this._chars;
  }

  /**
   * Creates a regular expression to match the specified search term considering
   * the available option settings
   * @param {string} str - The search term to be used
   * @param {boolean} patterns - Whether to return an object with pattern parts or RegExp object
   * @return {RegExpCreator~patternObj|RegExp}
   */
  create(str, patterns) {
    // always global; case insensitive unless the caseSensitive option is set
    const flags = 'g' + (this.opt.caseSensitive ? '' : 'i');

    // the order of the steps below matters: each one works on the output
    // (escaped text and placeholders) of the previous ones
    str = this.checkWildcardsEscape(str);
    str = this.createSynonyms(str, flags);

    const joiners = this.getJoinersPunctuation();

    if (joiners) {
      str = this.setupIgnoreJoiners(str);
    }

    if (this.opt.diacritics) {
      str = this.createDiacritics(str);
    }
    str = str.replace(/\s+/g, '[\\s]+');

    if (joiners) {
      str = this.createJoiners(str, joiners);
    }

    if (this.opt.wildcards !== 'disabled') {
      str = this.createWildcards(str);
    }

    const obj = this.createAccuracy(str);

    return (patterns
      ? obj
      : new RegExp(`${obj.lookbehind}(${obj.pattern})${obj.lookahead}`, flags));
  }

  /**
    * Creates a single combine pattern from an array of string considering the available option settings.
    * The lookbehind and lookahead parts are taken from the first array entry
    * @param {Array} array - The array of string
    * @param {boolean} capture - Whether to wrap an individual pattern in a capturing or non-capturing group
    * @return {RegExpCreator~patternObj|null} - null when array is not a non-empty array
    */
  createCombinePattern(array, capture) {
    if ( !Array.isArray(array) || !array.length) {
      return null;
    }
    const group = capture ? '(' : '(?:';
    // build every pattern object once; the first one is reused as the result
    const objs = array.map(str => this.create(str, true));
    const obj = objs[0];

    obj.pattern = this.distinct(objs.map(item => `${group}${item.pattern})`)).join('|');

    return obj;
  }

  /**
   * Sort array from longest entry to shortest. Entries of the same length
   * are sorted a-z. The array is sorted in place and returned
   * @param {array} arry - The array to sort
   * @return {array}
   */
  sortByLength(arry) {
    return arry.sort((a, b) => {
      if (a.length !== b.length) {
        return b.length - a.length;
      }
      // a consistent comparator must report equal entries as equal
      if (a === b) {
        return 0;
      }
      return a > b ? 1 : -1;
    });
  }

  /**
   * Escapes RegExp special characters
   * @param {string} str - The string to escape
   * @return {string}
   */
  escape(str) {
    return str.replace(/[[\]/{}()*+?.\\^$|]/g, '\\$&');
  }

  /**
   * Splits string if val is string, removes duplicates, escape '-^]\\' which are special in RegExp characters set
   * @param {array|string} val - The parameter to process
   * @return {string} - The characters to put inside a RegExp character set,
   * or an empty string when val is empty or not specified
   */
  preprocess(val) {
    if (val && val.length) {
      return this.distinct(typeof val === 'string' ? val.split('') : val).join('').replace(/[-^\]\\]/g, '\\$&');
    }
    return '';
  }

  /**
   * Removes duplicate or empty (white space only) entries, keeping the first
   * occurrence of each entry in its original order
   * @param {string[]} array - The array to process
   * @return {string[]} - A new array; the input is not modified
   */
  distinct(array) {
    const seen = new Set();
    const result = [];

    array.forEach(item => {
      if (item.trim() && !seen.has(item)) {
        seen.add(item);
        result.push(item);
      }
    });
    return result;
  }

  /**
   * Creates a regular expression string to match the defined synonyms
   * @param {string} str - The search term to be used
   * @param {string} flags - The RegExp flags used to find synonyms in the search term
   * @return {string}
   */
  createSynonyms(str, flags) {
    const syn = this.opt.synonyms;

    if ( !syn || !Object.keys(syn).length) {
      return str;
    }

    Object.keys(syn).forEach(key => {
      const values = Array.isArray(syn[key]) ? syn[key] : [syn[key]];
      // concat builds a new array, so the caller's synonyms are never modified
      const array = this.sortByLength(this.distinct([key].concat(values)))
        .map(term => this.checkWildcardsEscape(term));

      if (array.length > 1) {
        const pattern = array.map(k => this.escape(k)).join('|');
        const replacement = `(?:${array.join('|')})`;
        // a replacer function is used so that '$&', "$'" and '$`' inside
        // a synonym are inserted literally instead of being expanded
        str = str.replace(new RegExp(pattern, flags), () => replacement);
      }
    });
    return str;
  }

  /**
   * Check wildcards option creates placeholders in the regular expression string to allow later
   * insertion of wildcard patterns and escapes RegExp special characters
   * @param {string} str - The search term
   * @return {string}
   */
  checkWildcardsEscape(str) {
    if (this.opt.wildcards !== 'disabled') {
      // replaces single character wildcard with \x01, multiple character wildcard with \x02
      str = str.replace(/(\\.)+|[?*]/g, (m, gr) => gr ? m : m === '?' ? '\x01' : '\x02')
        // removes one backslash character before '?', '*', '\x01', and '\x02'
        .replace(/\\+(?=[?*\x01\x02])/g, m => m.slice(1));
    }
    return this.escape(str);
  }

  /**
   * Replaces the wildcard placeholders in a regular expression string
   * @param {string} str - The search term to be used
   * @return {string}
   */
  createWildcards(str) {
    // default to "enable" (i.e. to not include spaces)
    // "withSpaces" uses `[^]` instead of `.` because the latter does not match new line characters
    // or `[^\x01]` if blockElementsBoundary option is enabled
    // NOTE(review): blockElementsBoundary is not among the constructor
    // defaults; presumably it is supplied by the caller - confirm
    const spaces = this.opt.wildcards === 'withSpaces',
      boundary = this.opt.blockElementsBoundary,
      anyChar = `[^${spaces && boundary ? '\x01' : ''}]*?`;

    return str
    // replace \x01 with a RegExp class to match any single
    // character, or any single non-whitespace character depending
    // on the setting
      .replace(/\x01/g, spaces ? '[^]?' : '\\S?')
      // replace \x02 with a RegExp class to match zero or
      // more characters, or zero or more non-whitespace characters
      // depending on the setting
      .replace(/\x02/g, spaces ? anyChar : '\\S*');
  }

  /**
   * Creates placeholders in the regular expression string to allow later insertion of
   * designated characters (soft hyphens, zero width characters, and punctuation)
   * @param {string} str - The search term to be used
   * @return {string}
   */
  setupIgnoreJoiners(str) {
    // it's not added '\0' after `(?:` grouping construct, around `|` char and wildcard `\x02` placeholder,
    // before `)` char, and at the end of a string,
    // not breaks the grouping construct `(?:`, continues pairs of backslashes, and UTF-16 surrogate pairs
    const reg = /((?:\\\\)+|\x02|\(\?:|\|)|\\?(?:[\uD800-\uDBFF][\uDC00-\uDFFF]|.)(?=([|)\x02]|$)|.)/g;
    return str.replace(reg, (m, gr1, gr2) => {
      // gr1: a token that must stay intact; gr2 is defined (possibly as an
      // empty string at the end of the input) when no placeholder may follow
      return gr1 || typeof gr2 !== 'undefined' ? m : m + '\x00';
    });
  }

  /**
   * Replaces '\x00' placeholders in a regular expression string by designated
   * characters (soft hyphens, zero width characters, and punctuation) based on the
   * specified option values of <code>ignorePunctuation</code> and
   * <code>ignoreJoiners</code>
   * @param {string} str - The search term to be used
   * @param {string} joiners - The characters to put inside the inserted character set
   * @return {string}
   */
  createJoiners(str, joiners) {
    return str.split(/\x00+/).join(`[${joiners}]*`);
  }

  /**
   * Creates a punctuation and/or joiners pattern
   * @return {string} - The content of a RegExp character set, or an empty
   * string when neither ignorePunctuation nor ignoreJoiners is in use
   */
  getJoinersPunctuation() {
    // preprocess always returns a string (empty when nothing is specified)
    let str = this.preprocess(this.opt.ignorePunctuation);

    if (this.opt.ignoreJoiners) {
      // u+00ad = soft hyphen
      // u+200b = zero-width space
      // u+200c = zero-width non-joiner
      // u+200d = zero-width joiner
      str += '\\u00ad\\u200b\\u200c\\u200d';
    }
    return str;
  }

  /**
   * Creates a regular expression string to match diacritics
   * @param {string} str - The search term to be used
   * @return {string}
   */
  createDiacritics(str) {
    const array = this.chars;

    return str.split('').map(ch => {
      // even index: lower case group, odd index: the same group in upper case
      for (let i = 0; i < array.length; i += 2) {
        const lowerCase = array[i].indexOf(ch) !== -1;

        if (this.opt.caseSensitive) {
          if (lowerCase) {
            return '[' + array[i] + ']';

          } else if (array[i+1].indexOf(ch) !== -1) {
            return '[' + array[i+1] + ']';
          }
        } else if (lowerCase || array[i+1].indexOf(ch) !== -1) {
          return '[' + array[i] + array[i+1] + ']';
        }
      }
      return ch;
    }).join('');
  }

  /**
   * Creates a regular expression string to match the specified string with the
   * defined accuracy. All regular expressions created with two capturing groups.
   * The first group is ignored (serves as lookbehind with values 'exactly' and 'startsWith'),
   * the second is contained the actual match.
   * An accuracy that is neither a string nor an object (e.g. null or
   * undefined) is handled like 'partially'
   * @param {string} str - The search term to be used
   * @return {RegExpCreator~patternObj}
   */
  createAccuracy(str) {
    const chars = '!-/:-@[-`{-~¡¿'; // '!"#$%&\'()*+,\\-./:;<=>?@[\\]\\\\^_`{|}~¡¿';
    let accuracy = this.opt.accuracy,
      lookbehind = '()',
      pattern = str,
      lookahead = '',
      limiters;

    // the object form carries the accuracy value and optional custom limiters;
    // the null check avoids a TypeError when the option was set to null/undefined
    if (accuracy && typeof accuracy === 'object') {
      limiters = this.preprocess(accuracy.limiters);
      accuracy = accuracy.value;
    }

    if (accuracy === 'exactly') {
      const charSet = limiters ? '[\\s' + limiters + ']' : '\\s';
      lookbehind = `(^|${charSet})`;
      lookahead = `(?=$|${charSet})`;

    } else {
      const chs = limiters || chars,
        charSet = `[^\\s${chs}]*`;

      if (accuracy === 'complementary') {
        pattern = charSet + str + charSet;

      } else if (accuracy === 'startsWith') {
        lookbehind = `(^|[\\s${chs}])`;
        // extends every word of the term up to the next white space or limiter
        pattern = str.split(/\[\\s\]\+/g).join(charSet + '[\\s]+') + charSet;
      }
    }
    return { lookbehind, pattern, lookahead };
  }
}

export default RegExpCreator;