模式匹配和数字提取

问题描述 投票:0回答:1

我创建了以下代码来从用户提供的字符串中提取数字信息,该字符串用于指定建筑物中的楼层(level 或 floor)。目标是从输入中准确提取数值。但是,当前的实现无法正确处理带连字符的数字。例如,"twenty-three" 被错误地解析为 20 而不是 23。

/**
 * Pulls a building level (floor number) out of free-form text.
 *
 * Digits (optionally followed by st/nd/rd/th) and English number words from
 * one/first up to fifty/fiftieth are recognized, optionally preceded by
 * "level", "floor", "on" or "at". Matching is case-insensitive and the first
 * recognized number is returned.
 *
 * Known limitation: the pattern lists "twenty", "thirty" and "forty" ahead of
 * their hyphenated compounds, and a hyphen counts as a word boundary, so
 * "twenty-three" is reported as 20 rather than 23.
 *
 * @param {string} input - Text that may mention a level; must be a string
 *   because toLowerCase() is called on it.
 * @returns {number|null} The level, or null when nothing is recognized.
 */
function extractLevelFromString(input) {
    // Lowercase once so matched words line up with the all-lowercase table keys.
    const lowered = input.toLowerCase();

    // Cardinal and ordinal number words for 1 through 50, grouped by decade.
    const levelWords = {
        one: 1, two: 2, three: 3, four: 4, five: 5,
        six: 6, seven: 7, eight: 8, nine: 9, ten: 10,
        first: 1, second: 2, third: 3, fourth: 4, fifth: 5,
        sixth: 6, seventh: 7, eighth: 8, ninth: 9, tenth: 10,

        eleven: 11, twelve: 12, thirteen: 13, fourteen: 14, fifteen: 15,
        sixteen: 16, seventeen: 17, eighteen: 18, nineteen: 19, twenty: 20,
        eleventh: 11, twelfth: 12, thirteenth: 13, fourteenth: 14, fifteenth: 15,
        sixteenth: 16, seventeenth: 17, eighteenth: 18, nineteenth: 19, twentieth: 20,

        "twenty-one": 21, "twenty-two": 22, "twenty-three": 23, "twenty-four": 24, "twenty-five": 25,
        "twenty-six": 26, "twenty-seven": 27, "twenty-eight": 28, "twenty-nine": 29, thirty: 30,
        "twenty-first": 21, "twenty-second": 22, "twenty-third": 23, "twenty-fourth": 24, "twenty-fifth": 25,
        "twenty-sixth": 26, "twenty-seventh": 27, "twenty-eighth": 28, "twenty-ninth": 29, thirtieth: 30,

        "thirty-one": 31, "thirty-two": 32, "thirty-three": 33, "thirty-four": 34, "thirty-five": 35,
        "thirty-six": 36, "thirty-seven": 37, "thirty-eight": 38, "thirty-nine": 39, forty: 40,
        "thirty-first": 31, "thirty-second": 32, "thirty-third": 33, "thirty-fourth": 34, "thirty-fifth": 35,
        "thirty-sixth": 36, "thirty-seventh": 37, "thirty-eighth": 38, "thirty-ninth": 39, fortieth: 40,

        "forty-one": 41, "forty-two": 42, "forty-three": 43, "forty-four": 44, "forty-five": 45,
        "forty-six": 46, "forty-seven": 47, "forty-eight": 48, "forty-nine": 49, fifty: 50,
        "forty-first": 41, "forty-second": 42, "forty-third": 43, "forty-fourth": 44, "forty-fifth": 45,
        "forty-sixth": 46, "forty-seventh": 47, "forty-eighth": 48, "forty-ninth": 49, fiftieth: 50
    };

    // Group 1 is the optional lead-in word, group 2 the number itself.
    const levelPattern = /\b(level|floor|on|at)?\s*(\d+|one|two|three|four|five|six|seven|eight|nine|ten|eleven|twelve|thirteen|fourteen|fifteen|sixteen|seventeen|eighteen|nineteen|twenty|twenty-one|twenty-two|twenty-three|twenty-four|twenty-five|twenty-six|twenty-seven|twenty-eight|twenty-nine|thirty|thirty-one|thirty-two|thirty-three|thirty-four|thirty-five|thirty-six|thirty-seven|thirty-eight|thirty-nine|forty|forty-one|forty-two|forty-three|forty-four|forty-five|forty-six|forty-seven|forty-eight|forty-nine|fifty|first|second|third|fourth|fifth|sixth|seventh|eighth|ninth|tenth|eleventh|twelfth|thirteenth|fourteenth|fifteenth|sixteenth|seventeenth|eighteenth|nineteenth|twentieth|twenty-first|twenty-second|twenty-third|twenty-fourth|twenty-fifth|twenty-sixth|twenty-seventh|twenty-eighth|twenty-ninth|thirtieth|thirty-first|thirty-second|thirty-third|thirty-fourth|thirty-fifth|thirty-sixth|thirty-seventh|thirty-eighth|thirty-ninth|fortieth|forty-first|forty-second|forty-third|forty-fourth|forty-fifth|forty-sixth|forty-seventh|forty-eighth|forty-ninth|fiftieth)(?:st|nd|rd|th)?\b/gi;

    for (const [, , token] of lowered.matchAll(levelPattern)) {
        if (Number.isNaN(Number(token))) {
            // Not digits, so treat it as a number word.
            const mapped = levelWords[token];
            if (mapped) {
                return mapped;
            }
            continue;
        }

        // Digits: hand back the integer value.
        return parseInt(token, 10);
    }

    // Nothing in the text looked like a level.
    return null;
}

我使用正则表达式模式匹配尝试了此操作,并期望从输入字符串中解析数字。

javascript regex string pattern-matching
1个回答
0
投票

问题

当字符串为 "twenty-three"、"thirty-third" 等值时,正则表达式的交替分支会先匹配到其中较短的 "twenty"、"thirty",并在连字符处的单词边界结束匹配。

解决方案建议

更新正则表达式,加入负向前瞻,使 "twenty"、"thirty" 等值仅在其后没有紧跟连字符时才匹配。

示例:

// Group 1: optional lead-in word. Group 2: the number (digits or words).
// A bare tens word ("twenty") only matches when no hyphen follows it, so that
// hyphenated compounds ("twenty-three", "forty-eighth") are taken whole by the
// next alternative instead of being cut short at the hyphen.
const levelRegex =
  /\b(level|floor|on|at)?\s*(\d+|one|two|three|four|five|six|seven|eight|nine|ten|eleven|twelve|thirteen|fourteen|fifteen|sixteen|seventeen|eighteen|nineteen|(twenty|thirty|forty|fifty)(?!-)|(twenty|thirty|forty|fifty)-(one|first|two|second|three|third|four|fourth|five|fifth|six|sixth|seven|seventh|eight|eighth|nine|ninth)|first|second|third|fourth|fifth|sixth|seventh|eighth|ninth|tenth|eleventh|twelfth|thirteenth|fourteenth|fifteenth|sixteenth|seventeenth|eighteenth|nineteenth|twentieth|thirtieth|fortieth|fiftieth)(?:st|nd|rd|th)?\b/gi;

这里我还对一些值进行了分组以减少重复的子字符串。

/**
 * Extracts a building level (floor number) from free-form text.
 *
 * Recognizes plain digits with an optional ordinal suffix ("3", "3rd"),
 * cardinal and ordinal words from one/first to nineteen/nineteenth, the tens
 * words twenty through fifty (and their ordinals), and hyphenated compounds
 * such as "twenty-three" or "fifty-first". An optional "level", "floor", "on"
 * or "at" may precede the number. Matching is case-insensitive and the first
 * recognized number is returned.
 *
 * @param {string} input - Text that may mention a level.
 * @returns {number|null} The level, or null when none is found or when input
 *   is not a string.
 */
function extractLevelFromString(input) {
  // A non-string cannot name a level; report "not found" instead of throwing
  // on toLowerCase().
  if (typeof input !== "string") {
    return null;
  }

  // Normalize the input string so it lines up with the lowercase keys below.
  const normalizedInput = input.toLowerCase();

  // Base vocabulary only. Hyphenated compounds are deliberately not listed:
  // their value is the sum of the tens word and the unit word captured by the
  // regex, so every compound the regex accepts has a value.
  const wordToNumberMap = {
    one: 1,
    first: 1,
    two: 2,
    second: 2,
    three: 3,
    third: 3,
    four: 4,
    fourth: 4,
    five: 5,
    fifth: 5,
    six: 6,
    sixth: 6,
    seven: 7,
    seventh: 7,
    eight: 8,
    eighth: 8,
    nine: 9,
    ninth: 9,
    ten: 10,
    tenth: 10,
    eleven: 11,
    eleventh: 11,
    twelve: 12,
    twelfth: 12,
    thirteen: 13,
    thirteenth: 13,
    fourteen: 14,
    fourteenth: 14,
    fifteen: 15,
    fifteenth: 15,
    sixteen: 16,
    sixteenth: 16,
    seventeen: 17,
    seventeenth: 17,
    eighteen: 18,
    eighteenth: 18,
    nineteen: 19,
    nineteenth: 19,
    twenty: 20,
    twentieth: 20,
    thirty: 30,
    thirtieth: 30,
    forty: 40,
    fortieth: 40,
    fifty: 50,
    fiftieth: 50,
  };

  // Group 2 is the whole number. A bare tens word matches only when no hyphen
  // follows; otherwise the compound alternative captures its two halves as the
  // named groups `tens` and `unit`.
  const levelRegex =
    /\b(level|floor|on|at)?\s*(\d+|one|two|three|four|five|six|seven|eight|nine|ten|eleven|twelve|thirteen|fourteen|fifteen|sixteen|seventeen|eighteen|nineteen|(twenty|thirty|forty|fifty)(?!-)|(?<tens>twenty|thirty|forty|fifty)-(?<unit>one|first|two|second|three|third|four|fourth|five|fifth|six|sixth|seven|seventh|eight|eighth|nine|ninth)|first|second|third|fourth|fifth|sixth|seventh|eighth|ninth|tenth|eleventh|twelfth|thirteenth|fourteenth|fifteenth|sixteenth|seventeenth|eighteenth|nineteenth|twentieth|thirtieth|fortieth|fiftieth)(?:st|nd|rd|th)?\b/gi;

  // Process matches
  for (const match of normalizedInput.matchAll(levelRegex)) {
    const levelCandidate = match[2]; // Get the potential level part

    // If numeric, return directly
    if (/^\d+$/.test(levelCandidate)) {
      return parseInt(levelCandidate, 10);
    }

    // Hyphenated compound: tens + unit, e.g. "forty" + "eighth" -> 48.
    const { tens, unit } = match.groups;
    if (tens !== undefined) {
      return wordToNumberMap[tens] + wordToNumberMap[unit];
    }

    // Single word: look it up directly.
    const level = wordToNumberMap[levelCandidate];
    if (level !== undefined) {
      return level;
    }
  }

  // Return null if no level found
  return null;
}

// Demo: parse an ordinal compound and log the result (23).
const parsedLevel = extractLevelFromString("twenty-third");
console.log(parsedLevel);

© www.soinside.com 2019 - 2024. All rights reserved.