te')); return $arr; } /* 遍历用户所有主题 * @param $uid 用户ID * @param int $page 页数 * @param int $pagesize 每页记录条数 * @param bool $desc 排序方式 TRUE降序 FALSE升序 * @param string $key 返回的数组用那一列的值作为 key * @param array $col 查询哪些列 */ function thread_tid_find_by_uid($uid, $page = 1, $pagesize = 1000, $desc = TRUE, $key = 'tid', $col = array()) { if (empty($uid)) return array(); $orderby = TRUE == $desc ? -1 : 1; $arr = thread_tid__find($cond = array('uid' => $uid), array('tid' => $orderby), $page, $pagesize, $key, $col); return $arr; } // 遍历栏目下tid 支持数组 $fid = array(1,2,3) function thread_tid_find_by_fid($fid, $page = 1, $pagesize = 1000, $desc = TRUE) { if (empty($fid)) return array(); $orderby = TRUE == $desc ? -1 : 1; $arr = thread_tid__find($cond = array('fid' => $fid), array('tid' => $orderby), $page, $pagesize, 'tid', array('tid', 'verify_date')); return $arr; } function thread_tid_delete($tid) { if (empty($tid)) return FALSE; $r = thread_tid__delete(array('tid' => $tid)); return $r; } function thread_tid_count() { $n = thread_tid__count(); return $n; } // 统计用户主题数 大数量下严谨使用非主键统计 function thread_uid_count($uid) { $n = thread_tid__count(array('uid' => $uid)); return $n; } // 统计栏目主题数 大数量下严谨使用非主键统计 function thread_fid_count($fid) { $n = thread_tid__count(array('fid' => $fid)); return $n; } ?>python - Negative lookahead regex in `re.subn()` context - Stack Overflow
最新消息:雨落星辰是一个专注网站SEO优化、网站SEO诊断、搜索引擎研究、网络营销推广、网站策划运营及站长类的自媒体原创博客

python - Negative lookahead regex in `re.subn()` context - Stack Overflow

programmeradmin3浏览0评论

I am trying to use regular expressions to replace numeric ranges in text, such as "4-5", with the phrase "4 to 5".

The text also contains dates such as "2024-12-26" that should not be replaced (should be left as is).

The regular expression (\d+)(\-)(\d+) (attempt one below) is clearly wrong, because it falsely matches dates.

Using a negative lookahead expression, I came up with the regex (?!\d+\-\d+\-)(\d+)(\-)(\d+) instead (attempt two below), which correctly matches "4-5" while rejecting "2024-12-26".

However, attempt_two does not behave correctly in a re.subn() context, because although it rejects "2024-12-26", the search continues on to match (and replace) the substring "12-26":

import re

# Sample input: one date that must be left untouched and two numeric ranges
# that should be rewritten as "N to M".
text = """
2024-12-26
4-5
78-79
"""

# Attempt one: any "digits-digits" pair. Too permissive -- it also matches the
# leading "2024-12" of a date.
attempt_one = re.compile(r"(\d+)(\-)(\d+)")
# Attempt two: same pair, but refused at any position where a
# "digits-digits-" prefix starts. The lookahead only guards the position being
# tried, so a later position inside the date ("12-26") can still match.
attempt_two = re.compile(r"(?!\d+\-\d+\-)(\d+)(\-)(\d+)")

print("Attempt one:")
print(re.match(attempt_one, "4-5"))  # Match: OK
print(re.match(attempt_one, "2024-12-26"))  # Match: False positive
new_text, _ = re.subn(attempt_one, r"\1 to \3", text)  # Incorrect substitution
print(new_text)

print("Attempt two:")
print(re.match(attempt_two, "4-5"))  # Match: OK
print(re.match(attempt_two, "2024-12-26"))  # Doesn't match: OK
new_text, _ = re.subn(attempt_two, r"\1 to \3", text)  # Still incorrect
print(new_text)

Output:

Attempt one:
<re.Match object; span=(0, 3), match='4-5'>
<re.Match object; span=(0, 7), match='2024-12'>

2024 to 12-26
4 to 5
78 to 79

Attempt two:
<re.Match object; span=(0, 3), match='4-5'>
None

2024-12 to 26
4 to 5
78 to 79

What regular expression can I use so that the substitution returns the following instead?

2024-12-26
4 to 5
78 to 79

(As my goal is to learn about regular expressions, I am not interested in workarounds such as matching the whitespace or newline after "12-26".)

I am trying to use regular expressions to replace numeric ranges in text, such as "4-5", with the phrase "4 to 5".

The text also contains dates such as "2024-12-26" that should not be replaced (should be left as is).

The regular expression (\d+)(\-)(\d+) (attempt one below) is clearly wrong, because it falsely matches dates.

Using a negative lookahead expression, I came up with the regex (?!\d+\-\d+\-)(\d+)(\-)(\d+) instead (attempt two below), which correctly matches "4-5" while rejecting "2024-12-26".

However, attempt_two does not behave correctly in a re.subn() context, because although it rejects "2024-12-26", the search continues on to match (and replace) the substring "12-26":

import re

# Sample input: one date that must be left untouched and two numeric ranges
# that should be rewritten as "N to M".
text = """
2024-12-26
4-5
78-79
"""

# Attempt one: any "digits-digits" pair. Too permissive -- it also matches the
# leading "2024-12" of a date.
attempt_one = re.compile(r"(\d+)(\-)(\d+)")
# Attempt two: same pair, but refused at any position where a
# "digits-digits-" prefix starts. The lookahead only guards the position being
# tried, so a later position inside the date ("12-26") can still match.
attempt_two = re.compile(r"(?!\d+\-\d+\-)(\d+)(\-)(\d+)")

print("Attempt one:")
print(re.match(attempt_one, "4-5"))  # Match: OK
print(re.match(attempt_one, "2024-12-26"))  # Match: False positive
new_text, _ = re.subn(attempt_one, r"\1 to \3", text)  # Incorrect substitution
print(new_text)

print("Attempt two:")
print(re.match(attempt_two, "4-5"))  # Match: OK
print(re.match(attempt_two, "2024-12-26"))  # Doesn't match: OK
new_text, _ = re.subn(attempt_two, r"\1 to \3", text)  # Still incorrect
print(new_text)

Output:

Attempt one:
<re.Match object; span=(0, 3), match='4-5'>
<re.Match object; span=(0, 7), match='2024-12'>

2024 to 12-26
4 to 5
78 to 79

Attempt two:
<re.Match object; span=(0, 3), match='4-5'>
None

2024-12 to 26
4 to 5
78 to 79

What regular expression can I use so that the substitution returns the following instead?

2024-12-26
4 to 5
78 to 79

(As my goal is to learn about regular expressions, I am not interested in workarounds such as matching the whitespace or newline after "12-26".)

Share Improve this question asked Feb 17 at 19:09 MaxMax 8778 silver badges20 bronze badges 2
  • You wanna swap month and day? – aaa Commented Feb 17 at 19:50
  • Why do you use subn instead of sub? Does the issue not happen with sub? – no comment Commented Feb 17 at 20:57
Add a comment  | 

3 Answers 3

Reset to default 3

You need both a negative lookbehind and a negative lookahead, to prohibit an extra hyphen before or after the match.

(?<![-\d])(\d+)-(\d+)(?![-\d])

The lookarounds also have to match digits, so it won't match part of the date, e.g. 024-1 from 2024-12-26.

It seems like you want to replace ranges; in that case, you can do it without using lookaround, which makes it easier to debug and simpler to translate to other languages.

import re

# Sample input: hyphenated and space-padded number chains, plain two-number
# ranges, and lines that already use the word "to".
text = """
2024-12-26
2024 - 12 - 26
2024 - 12 - 26 - 26 
4-5
78-79
4 to 5  43 to     15
4 - 5  43 -     15

4-5  43-15
"""


def substitute(text):
    """Rewrite two-number hyphen ranges in *text* as "A to B".

    The pattern has two alternatives.  The first consumes a chain of three
    or more hyphen-joined numbers (a date, for example) so that it can be put
    back unchanged; the second captures the two ends of a plain range.
    Optional whitespace around each hyphen is accepted by both.

    Returns the rewritten string.
    """
    range_or_chain = r'((?:\d+\s*[-]\s*){2,}\d+)|(\d+)\s*[-]\s*(\d+)'

    def rewrite(match):
        chain, low, high = match.groups()
        # Groups 2 and 3 are only populated when the range alternative matched.
        if low and high:
            return f"{low} to {high}"
        return chain

    rewritten, _count = re.subn(range_or_chain, rewrite, text)
    return rewritten


# Print the sample text after running it through substitute().
print(substitute(text))

Prints:


2024-12-26
2024 - 12 - 26
2024 - 12 - 26 - 26 
4 to 5
78 to 79
4 to 5  43 to     15
4 to 5  43 to 15

4 to 5  43 to 15



Details are in this link

You could use this pattern to match a range [from_number]-[to_number] anywhere in a text:

PYTHON CODE (re module)

import re

# A "digits-digits" range that is not part of a longer hyphen-joined chain:
# the lookbehind rejects a preceding "digit-", the lookahead rejects a
# following "-digit", and the word boundaries stop a match from starting or
# ending in the middle of a number.
pattern = r"\b(?<!\d-)(\d+)(-)(\d+)(?!-\d)\b"
pattern_re = re.compile(pattern)
replacement = r"\1 to \3"

# `text` is the sample string listed in the TEXT section below.
# subn() returns a (new_string, number_of_substitutions) tuple.
new_text = pattern_re.subn(replacement, text)

# Print the rewritten text, then the substitution count.
for x in new_text:
    print(x)

PATTERN DEMO: https://regex101.com/r/m4oCWQ/1

NOTES

  • \b Word boundary. Make sure there is not an alphanumeric character or underscore _ before the first digit \d. This makes sure that the first number is not following another number (or letter or _), so it cannot start matching between the digits \d.
  • (?<!\d-) Negative lookbehind: The match may not be preceded by a digit \d followed by a dash -.
  • (\d+) First capture group for one or more (+) digits \d. In the replacement string, referred to with \1.
  • (-) Second capture group. In the replacement string, replaced with 'to'.
  • (\d+) Third capture group for one or more (+) digits \d. In the replacement string, referred to with \3.
  • (?!-\d) Negative lookahead: The match may not be followed by a dash - followed by a digit \d.
  • \b Word boundary. Make sure there is not an alphanumeric character or underscore _ after the last digit \d of the match. This makes sure that the last number is not followed by another number (or letter or _), i.e. the match cannot end between two digits (in the middle of the number digits).

TEXT

# Sample input: two-number ranges on their own and inside sentences, plus
# longer hyphen-joined number chains such as dates.
text = """
2024-12-26
4-5
78-79
45-45
4-55
44-5
The sun is shining on 12-20-2027
The sun was shining from 6-15
I saw the moon 4-3
What if there is 20-12-23-20221 or 03-03-4? 
What about 3-2?
"""

RESULT

2024-12-26
4 to 5
78 to 79
45 to 45
4 to 55
44 to 5
The sun is shining on 12-20-2027
The sun was shining from 6 to 15
I saw the moon 4 to 3
What if there is 20-12-23-20221 or 03-03-4?
What about 3 to 2?

8

发布评论

评论列表(0)

  1. 暂无评论