From e6b28362b08a85a4fc307657aef0eb7d6e0174ae Mon Sep 17 00:00:00 2001 From: Bulhwi Cha Date: Sun, 31 Mar 2024 18:10:54 +0900 Subject: [PATCH] feat: add `String.splitOn_of_valid` Add `String.splitOnAux_of_valid` and `String.splitOn_of_valid`. --- Std/Data/String/Lemmas.lean | 157 +++++++++++++++++++++++++++++++++++- 1 file changed, 153 insertions(+), 4 deletions(-) diff --git a/Std/Data/String/Lemmas.lean b/Std/Data/String/Lemmas.lean index e87013574d..76fd7b8324 100644 --- a/Std/Data/String/Lemmas.lean +++ b/Std/Data/String/Lemmas.lean @@ -4,10 +4,9 @@ Released under Apache 2.0 license as described in the file LICENSE. Authors: Bulhwi Cha, Mario Carneiro -/ import Std.Data.Char -import Std.Data.Nat.Lemmas -import Std.Data.List.Lemmas +import Std.Data.List.SplitOnList import Std.Data.String.Basic -import Std.Tactic.Lint.Misc +import Std.Logic import Std.Tactic.SeqFocus @[simp] theorem Char.length_toString (c : Char) : c.toString.length = 1 := rfl @@ -79,6 +78,9 @@ private theorem ne_self_add_add_csize : i ≠ i + (n + csize c) := @[simp] theorem utf8Len_eq_zero : utf8Len l = 0 ↔ l = [] := by cases l <;> simp [Nat.ne_of_gt add_csize_pos] +theorem utf8Len_pos : 0 < utf8Len l ↔ l ≠ [] := by + cases l <;> simp [add_csize_pos] + section open List theorem utf8Len_le_of_sublist : ∀ {cs₁ cs₂}, cs₁ <+ cs₂ → utf8Len cs₁ ≤ utf8Len cs₂ @@ -484,7 +486,154 @@ theorem splitAux_of_valid (p l m r acc) : theorem split_of_valid (s p) : split s p = (List.splitOnP p s.1).map mk := by simpa [split] using splitAux_of_valid p [] [] s.1 [] --- TODO: splitOn +/- +NOTE: When the latest Lean toolchain has the fixed versions of `splitOnAux` and `splitOn`, remove +this section and replace `splitOnAux'` and `splitOn'` with `splitOnAux` and `splitOn`, respectively. +See https://github.com/leanprover/lean4/pull/3832. +-/ +section DELETE_THIS_LATER + +/-- +Auxiliary for `splitOn`. Preconditions: +* `sep` is not empty +* `b <= i` are indexes into `s` +* `j` is an index into `sep`, and not at the end + +It represents the state where we have currently parsed some split parts into `r` (in reverse order), +`b` is the beginning of the string / the end of the previous match of `sep`, and the first `j` bytes +of `sep` match the bytes `i-j .. i` of `s`. +-/ +def splitOnAux' (s sep : String) (b : Pos) (i : Pos) (j : Pos) (r : List String) : List String := + if s.atEnd i then + let r := (s.extract b i)::r + r.reverse + else + if s.get i == sep.get j then + let i := s.next i + let j := sep.next j + if sep.atEnd j then + splitOnAux' s sep i i 0 (s.extract b (i - j)::r) + else + splitOnAux' s sep b i j r + else + splitOnAux' s sep b (s.next (i - j)) 0 r +termination_by (s.endPos.1 - (i - j).1, sep.endPos.1 - j.1) +decreasing_by + all_goals simp_wf + focus + rename_i h _ _ + left; exact Nat.sub_lt_sub_left + (Nat.lt_of_le_of_lt (Nat.sub_le ..) (Nat.gt_of_not_le (mt decide_eq_true h))) + (Nat.lt_of_le_of_lt (Nat.sub_le ..) (lt_next s _)) + focus + rename_i i₀ j₀ _ eq h' + repeat rw [← Pos.sub_byteIdx] + rw [show (s.next i₀ - sep.next j₀).1 = (i₀ - j₀).1 by + show (_ + csize _) - (_ + csize _) = _ + rw [(beq_iff_eq ..).1 eq, Nat.add_sub_add_right]; rfl] + right; exact Nat.sub_lt_sub_left + (Nat.lt_of_le_of_lt (Nat.le_add_right ..) (Nat.gt_of_not_le (mt decide_eq_true h'))) + (lt_next sep _) + focus + rename_i h _ + left; exact Nat.sub_lt_sub_left + (Nat.lt_of_le_of_lt (Nat.sub_le ..) (Nat.gt_of_not_le (mt decide_eq_true h))) + (lt_next s _) + +/-- +Splits a string `s` on occurrences of the separator `sep`. When `sep` is empty, it returns `[s]`; +when `sep` occurs in overlapping patterns, the first match is taken. There will always be exactly +`n+1` elements in the returned list if there were `n` nonoverlapping matches of `sep` in the string. +The default separator is `" "`. The separators are not included in the returned substrings. + +``` +"here is some text ".splitOn = ["here", "is", "some", "text", ""] +"here is some text ".splitOn "some" = ["here is ", " text "] +"here is some text ".splitOn "" = ["here is some text "] +"ababacabac".splitOn "aba" = ["", "bac", "c"] +``` +-/ +def splitOn' (s : String) (sep : String := " ") : List String := + if sep == "" then [s] else splitOnAux' s sep 0 0 0 [] + +end DELETE_THIS_LATER + +theorem splitOnAux_of_valid (sep₁ sep₂ l m r acc) (h : sep₂ ≠ []) : + splitOnAux' ⟨l ++ m ++ sep₁ ++ r⟩ ⟨sep₁ ++ sep₂⟩ ⟨utf8Len l⟩ ⟨utf8Len (l ++ m ++ sep₁)⟩ + ⟨utf8Len sep₁⟩ acc = + acc.reverse ++ (List.modifyHead (m ++ ·) <| List.splitOnListAux r sep₁ sep₂ h).map mk := by + obtain ⟨c₂, sep₂, rfl⟩ := sep₂.exists_cons_of_ne_nil h + rw [splitOnAux', List.splitOnListAux] + simp only [List.append_assoc, utf8Len_append, atEnd_iff, endPos_eq, Pos.mk_le_mk, + Nat.add_le_add_iff_left, Nat.add_right_le_self, utf8Len_eq_zero, by simpa only + [List.append_assoc, utf8Len_append] using (⟨get_of_valid (l++m++sep₁) r, next_of_valid + (l++m++sep₁) (r.headD default) r, extract_of_valid l (m++sep₁) r⟩ : _∧_∧_), List.reverse_cons, + get_of_valid, List.headD_cons, beq_iff_eq, next, Pos.addChar_eq, utf8Len_cons, + Nat.add_left_le_self, Pos.sub_eq, dite_eq_ite] + split + · simp + · next hls => + obtain ⟨cᵣ, r, rfl⟩ := r.exists_cons_of_ne_nil hls + simp only [List.headD_cons, List.head_cons, List.tail_cons, false_or] + split + · next hc => + subst c₂ + rw [← Nat.add_assoc, Nat.add_assoc, Nat.add_sub_cancel, Nat.add_assoc] + split + · subst sep₂ + have ih := by simpa only [List.append_assoc, List.append_nil, List.singleton_append, + List.nil_append, utf8Len_append, utf8Len_cons, utf8Len_nil, Nat.zero_add, Pos.mk_zero, + List.reverse_cons, ← Function.id_def, List.modifyHead_id] using (splitOnAux_of_valid [] + (sep₁++[cᵣ]) (l++m++sep₁++[cᵣ]) [] r (⟨m⟩::acc) (by simp)) + simp [by simpa using extract_of_valid l m (sep₁++cᵣ::r), ih] + · next hsp₂ => + simpa using splitOnAux_of_valid (sep₁++[cᵣ]) sep₂ l m r acc hsp₂ + · next hc => + have hget := by simpa only [List.append_assoc, utf8Len_append] + using get_of_valid (l++m) (sep₁++cᵣ::r) + have ih := by simpa only [List.append_assoc, List.append_nil, List.singleton_append, + List.nil_append, utf8Len_append, utf8Len_cons, utf8Len_nil, Nat.zero_add, Pos.mk_zero, + List.head_cons_tail (sep₁++cᵣ::r) (by simp)] using (splitOnAux_of_valid [] (sep₁++c₂::sep₂) + l (m++[(sep₁++cᵣ::r).head (by simp)]) (sep₁++cᵣ::r).tail acc (by simp)) + rw [Nat.add_sub_assoc, Nat.add_sub_cancel, hget, List.headD_eq_head (sep₁++cᵣ::r) (by simp), + Nat.add_assoc, ih, List.append_cancel_left_eq, List.modifyHead_modifyHead, + Function.comp_def]; clear hget ih + apply Nat.le_add_left +termination_by (utf8Len sep₁ + utf8Len r, utf8Len sep₂) +decreasing_by + all_goals simp_wf + · rename_i _₀ _₁ _sep₂ _₃ _₄ _₅ _₆ _₇ _₈ _r _₁₀ _₁₁ _₁₂ _₁₃ _₁₄ _ _₁₆ _₁₇ _₁₈ _ _ _ _ + clear _₀ _₁ _₃ _₄ _₅ _₆ _₇ _₈ _₁₀ _₁₁ _₁₂ _₁₃ _₁₄ _₁₆ _₁₇ _₁₈ + subst _r _sep₂ c₂ sep₂ + left + rw [utf8Len_cons] + calc + utf8Len r < utf8Len r + csize cᵣ := Pos.lt_addChar .. + _ ≤ utf8Len sep₁ + (utf8Len r + csize cᵣ) := Nat.le_add_left .. + · rename_i _₀ _₁ _sep₂ _₃ _₄ _₅ _₆ _₇ _₈ _r _₁₀ _₁₁ _₁₂ _₁₃ _₁₄ _ _₁₆ _₁₇ _₁₈ _ _ _ + clear _₀ _₁ _₃ _₄ _₅ _₆ _₇ _₈ _₁₀ _₁₁ _₁₂ _₁₃ _₁₄ _₁₆ _₁₇ _₁₈ + subst _r _sep₂ c₂ + repeat rw [utf8Len_cons] + rw [Nat.add_right_comm, Nat.add_assoc] + right + apply Pos.lt_addChar + · rename_i _₀ _₁ _sep₂ _₃ _₄ _₅ _₆ _₇ _₈ _r _₁₀ _₁₁ _₁₂ _₁₃ _₁₄ _ _₁₆ _₁₇ _ _ + clear _₀ _₁ _₃ _₄ _₅ _₆ _₇ _₈ _₁₀ _₁₁ _₁₂ _₁₃ _₁₄ _₁₆ _₁₇ + subst _r _sep₂ + left + conv => rhs; rw [← utf8Len_append, ← List.head_cons_tail (sep₁++cᵣ::r) (by simp), utf8Len_cons] + apply Pos.lt_addChar + +theorem splitOn_of_valid (s sep) : splitOn' s sep = (List.splitOnList s.1 sep.1).map mk := by + simp only [splitOn', beq_iff_eq] + split <;> rename_i hsp + · simp [hsp, List.splitOnList] + · have aux := by simpa only [List.append_nil, List.nil_append, utf8Len_nil, Pos.mk_zero, + List.reverse_nil, ← Function.id_def, List.modifyHead_id] using + splitOnAux_of_valid [] sep.1 [] [] s.1 [] (hsp ∘ ext) + rw [aux]; clear aux + apply congrArg + simp [List.splitOnListAux_eq_splitOnList_append_append] @[simp] theorem toString_toSubstring (s : String) : s.toSubstring.toString = s := extract_zero_endPos _