Skip to content

Commit

Permalink
Fix whitespace catastrophic backtracking
Browse files Browse the repository at this point in the history
  • Loading branch information
Lőrinc committed Feb 13, 2024
1 parent 5f07fc2 commit 51c8a8a
Show file tree
Hide file tree
Showing 2 changed files with 3 additions and 3 deletions.
2 changes: 1 addition & 1 deletion tests/test_encoding.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
@pytest.mark.parametrize("make_enc", ENCODING_FACTORIES)
def test_extremely_big_encoding(make_enc: Callable[[], tiktoken.Encoding]):
enc = make_enc()
for c in ["^", "0", "a", "'s"]: # TODO " ", "\n" are still failing
for c in ["^", "0", "a", "'s", " ", "\n"]:
print(f"Validating `{c}`")

big_value = c * 10_000
Expand Down
4 changes: 2 additions & 2 deletions tiktoken_ext/openai_public.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
# The pattern in the original GPT-2 release is:
# r"""'s|'t|'re|'ve|'m|'ll|'d| ?[\p{L}]+| ?[\p{N}]+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"""
# This is equivalent, but executes faster:
_legacy_splitter_regex = r"""'(?:[sdmt]|ll|ve|re)| ?\p{L}++| ?\p{N}++| ?[^\s\p{L}\p{N}]++|\s+(?!\S)|\s++"""
_legacy_splitter_regex = r"""'(?:[sdmt]|ll|ve|re)| ?\p{L}++| ?\p{N}++| ?[^\s\p{L}\p{N}]++|\s++$|\s+(?!\S)|\s"""


def gpt2():
Expand Down Expand Up @@ -84,7 +84,7 @@ def cl100k_base():
}
return {
"name": "cl100k_base",
"pat_str": r"""'(?i:[sdmt]|ll|ve|re)|[^\r\n\p{L}\p{N}]?+\p{L}++|\p{N}{1,3}+| ?[^\s\p{L}\p{N}]++[\r\n]*+|\s*[\r\n]|\s+(?!\S)|\s++""",
"pat_str": r"""'(?i:[sdmt]|ll|ve|re)|[^\r\n\p{L}\p{N}]?+\p{L}++|\p{N}{1,3}+| ?[^\s\p{L}\p{N}]++[\r\n]*+|\s++$|\s*[\r\n]|\s+(?!\S)|\s""",
"mergeable_ranks": mergeable_ranks,
"special_tokens": special_tokens,
}
Expand Down

0 comments on commit 51c8a8a

Please sign in to comment.