feat: add String.splitOn_of_valid

Add `String.splitOnAux_of_valid` and `String.splitOn_of_valid`.
leanprover-community · Apr 19, 2024 · e6b2836 · e6b2836
1 parent 770836f
commit e6b2836
Showing 1 changed file with 153 additions and 4 deletions.
diff --git a/Std/Data/String/Lemmas.lean b/Std/Data/String/Lemmas.lean
@@ -4,10 +4,9 @@ Released under Apache 2.0 license as described in the file LICENSE.
 Authors: Bulhwi Cha, Mario Carneiro
 -/
 import Std.Data.Char
-import Std.Data.Nat.Lemmas
-import Std.Data.List.Lemmas
+import Std.Data.List.SplitOnList
 import Std.Data.String.Basic
-import Std.Tactic.Lint.Misc
+import Std.Logic
 import Std.Tactic.SeqFocus
 
 @[simp] theorem Char.length_toString (c : Char) : c.toString.length = 1 := rfl
@@ -79,6 +78,9 @@ private theorem ne_self_add_add_csize : i ≠ i + (n + csize c) :=
 @[simp] theorem utf8Len_eq_zero : utf8Len l = 0 ↔ l = [] := by
   cases l <;> simp [Nat.ne_of_gt add_csize_pos]
 
+theorem utf8Len_pos : 0 < utf8Len l ↔ l ≠ [] := by
+  cases l <;> simp [add_csize_pos]
+
 section
 open List
 theorem utf8Len_le_of_sublist : ∀ {cs₁ cs₂}, cs₁ <+ cs₂ → utf8Len cs₁ ≤ utf8Len cs₂
@@ -484,7 +486,154 @@ theorem splitAux_of_valid (p l m r acc) :
 theorem split_of_valid (s p) : split s p = (List.splitOnP p s.1).map mk := by
   simpa [split] using splitAux_of_valid p [] [] s.1 []
 
--- TODO: splitOn
+/-
+NOTE: When the latest Lean toolchain has the fixed versions of `splitOnAux` and `splitOn`, remove
+this section and replace `splitOnAux'` and `splitOn'` with `splitOnAux` and `splitOn`, respectively.
+See https://github.com/leanprover/lean4/pull/3832.
+-/
+section DELETE_THIS_LATER
+
+/--
+Auxiliary for `splitOn`. Preconditions:
+* `sep` is not empty
+* `b <= i` are indexes into `s`
+* `j` is an index into `sep`, and not at the end
+
+It represents the state where we have currently parsed some split parts into `r` (in reverse order),
+`b` is the beginning of the string / the end of the previous match of `sep`, and the first `j` bytes
+of `sep` match the bytes `i-j .. i` of `s`.
+-/
+def splitOnAux' (s sep : String) (b : Pos) (i : Pos) (j : Pos) (r : List String) : List String :=
+  if s.atEnd i then
+    let r := (s.extract b i)::r
+    r.reverse
+  else
+    if s.get i == sep.get j then
+      let i := s.next i
+      let j := sep.next j
+      if sep.atEnd j then
+        splitOnAux' s sep i i 0 (s.extract b (i - j)::r)
+      else
+        splitOnAux' s sep b i j r
+    else
+      splitOnAux' s sep b (s.next (i - j)) 0 r
+termination_by (s.endPos.1 - (i - j).1, sep.endPos.1 - j.1)
+decreasing_by
+  all_goals simp_wf
+  focus
+    rename_i h _ _
+    left; exact Nat.sub_lt_sub_left
+      (Nat.lt_of_le_of_lt (Nat.sub_le ..) (Nat.gt_of_not_le (mt decide_eq_true h)))
+      (Nat.lt_of_le_of_lt (Nat.sub_le ..) (lt_next s _))
+  focus
+    rename_i i₀ j₀ _ eq h'
+    repeat rw [← Pos.sub_byteIdx]
+    rw [show (s.next i₀ - sep.next j₀).1 = (i₀ - j₀).1 by
+      show (_ + csize _) - (_ + csize _) = _
+      rw [(beq_iff_eq ..).1 eq, Nat.add_sub_add_right]; rfl]
+    right; exact Nat.sub_lt_sub_left
+      (Nat.lt_of_le_of_lt (Nat.le_add_right ..) (Nat.gt_of_not_le (mt decide_eq_true h')))
+      (lt_next sep _)
+  focus
+    rename_i h _
+    left; exact Nat.sub_lt_sub_left
+      (Nat.lt_of_le_of_lt (Nat.sub_le ..) (Nat.gt_of_not_le (mt decide_eq_true h)))
+      (lt_next s _)
+
+/--
+Splits a string `s` on occurrences of the separator `sep`. When `sep` is empty, it returns `[s]`;
+when `sep` occurs in overlapping patterns, the first match is taken. There will always be exactly
+`n+1` elements in the returned list if there were `n` nonoverlapping matches of `sep` in the string.
+The default separator is `" "`. The separators are not included in the returned substrings.
+
+```
+"here is some text ".splitOn = ["here", "is", "some", "text", ""]
+"here is some text ".splitOn "some" = ["here is ", " text "]
+"here is some text ".splitOn "" = ["here is some text "]
+"ababacabac".splitOn "aba" = ["", "bac", "c"]
+```
+-/
+def splitOn' (s : String) (sep : String := " ") : List String :=
+  if sep == "" then [s] else splitOnAux' s sep 0 0 0 []
+
+end DELETE_THIS_LATER
+
+theorem splitOnAux_of_valid (sep₁ sep₂ l m r acc) (h : sep₂ ≠ []) :
+    splitOnAux' ⟨l ++ m ++ sep₁ ++ r⟩ ⟨sep₁ ++ sep₂⟩ ⟨utf8Len l⟩ ⟨utf8Len (l ++ m ++ sep₁)⟩
+      ⟨utf8Len sep₁⟩ acc =
+    acc.reverse ++ (List.modifyHead (m ++ ·) <| List.splitOnListAux r sep₁ sep₂ h).map mk := by
+  obtain ⟨c₂, sep₂, rfl⟩ := sep₂.exists_cons_of_ne_nil h
+  rw [splitOnAux', List.splitOnListAux]
+  simp only [List.append_assoc, utf8Len_append, atEnd_iff, endPos_eq, Pos.mk_le_mk,
+    Nat.add_le_add_iff_left, Nat.add_right_le_self, utf8Len_eq_zero, by simpa only
+    [List.append_assoc, utf8Len_append] using (⟨get_of_valid (l++m++sep₁) r, next_of_valid
+    (l++m++sep₁) (r.headD default) r, extract_of_valid l (m++sep₁) r⟩ : _∧_∧_),  List.reverse_cons,
+    get_of_valid, List.headD_cons, beq_iff_eq, next, Pos.addChar_eq, utf8Len_cons,
+    Nat.add_left_le_self, Pos.sub_eq, dite_eq_ite]
+  split
+  · simp
+  · next hls =>
+    obtain ⟨cᵣ, r, rfl⟩ := r.exists_cons_of_ne_nil hls
+    simp only [List.headD_cons, List.head_cons, List.tail_cons, false_or]
+    split
+    · next hc =>
+      subst c₂
+      rw [← Nat.add_assoc, Nat.add_assoc, Nat.add_sub_cancel, Nat.add_assoc]
+      split
+      · subst sep₂
+        have ih := by simpa only [List.append_assoc, List.append_nil, List.singleton_append,
+          List.nil_append, utf8Len_append, utf8Len_cons, utf8Len_nil, Nat.zero_add, Pos.mk_zero,
+          List.reverse_cons, ← Function.id_def, List.modifyHead_id] using (splitOnAux_of_valid []
+          (sep₁++[cᵣ]) (l++m++sep₁++[cᵣ]) [] r (⟨m⟩::acc) (by simp))
+        simp [by simpa using extract_of_valid l m (sep₁++cᵣ::r), ih]
+      · next hsp₂ =>
+        simpa using splitOnAux_of_valid (sep₁++[cᵣ]) sep₂ l m r acc hsp₂
+    · next hc =>
+      have hget := by simpa only [List.append_assoc, utf8Len_append]
+        using get_of_valid (l++m) (sep₁++cᵣ::r)
+      have ih := by simpa only [List.append_assoc, List.append_nil, List.singleton_append,
+        List.nil_append, utf8Len_append, utf8Len_cons, utf8Len_nil, Nat.zero_add, Pos.mk_zero,
+        List.head_cons_tail (sep₁++cᵣ::r) (by simp)] using (splitOnAux_of_valid [] (sep₁++c₂::sep₂)
+        l (m++[(sep₁++cᵣ::r).head (by simp)]) (sep₁++cᵣ::r).tail acc (by simp))
+      rw [Nat.add_sub_assoc, Nat.add_sub_cancel, hget, List.headD_eq_head (sep₁++cᵣ::r) (by simp),
+        Nat.add_assoc, ih, List.append_cancel_left_eq, List.modifyHead_modifyHead,
+        Function.comp_def]; clear hget ih
+      apply Nat.le_add_left
+termination_by (utf8Len sep₁ + utf8Len r, utf8Len sep₂)
+decreasing_by
+  all_goals simp_wf
+  · rename_i _₀ _₁ _sep₂ _₃ _₄ _₅ _₆ _₇ _₈ _r _₁₀ _₁₁ _₁₂ _₁₃ _₁₄ _ _₁₆ _₁₇ _₁₈ _ _ _ _
+    clear _₀ _₁ _₃ _₄ _₅ _₆ _₇ _₈ _₁₀ _₁₁ _₁₂ _₁₃ _₁₄ _₁₆ _₁₇ _₁₈
+    subst _r _sep₂ c₂ sep₂
+    left
+    rw [utf8Len_cons]
+    calc
+      utf8Len r < utf8Len r + csize cᵣ := Pos.lt_addChar ..
+      _ ≤ utf8Len sep₁ + (utf8Len r + csize cᵣ) := Nat.le_add_left ..
+  · rename_i _₀ _₁ _sep₂ _₃ _₄ _₅ _₆ _₇ _₈ _r _₁₀ _₁₁ _₁₂ _₁₃ _₁₄ _ _₁₆ _₁₇ _₁₈ _ _ _
+    clear _₀ _₁ _₃ _₄ _₅ _₆ _₇ _₈ _₁₀ _₁₁ _₁₂ _₁₃ _₁₄ _₁₆ _₁₇ _₁₈
+    subst _r _sep₂ c₂
+    repeat rw [utf8Len_cons]
+    rw [Nat.add_right_comm, Nat.add_assoc]
+    right
+    apply Pos.lt_addChar
+  · rename_i _₀ _₁ _sep₂ _₃ _₄ _₅ _₆ _₇ _₈ _r _₁₀ _₁₁ _₁₂ _₁₃ _₁₄ _ _₁₆ _₁₇ _ _
+    clear _₀ _₁ _₃ _₄ _₅ _₆ _₇ _₈ _₁₀ _₁₁ _₁₂ _₁₃ _₁₄ _₁₆ _₁₇
+    subst _r _sep₂
+    left
+    conv => rhs; rw [← utf8Len_append, ← List.head_cons_tail (sep₁++cᵣ::r) (by simp), utf8Len_cons]
+    apply Pos.lt_addChar
+
+theorem splitOn_of_valid (s sep) : splitOn' s sep = (List.splitOnList s.1 sep.1).map mk := by
+  simp only [splitOn', beq_iff_eq]
+  split <;> rename_i hsp
+  · simp [hsp, List.splitOnList]
+  · have aux := by simpa only [List.append_nil, List.nil_append, utf8Len_nil, Pos.mk_zero,
+      List.reverse_nil, ← Function.id_def, List.modifyHead_id] using
+      splitOnAux_of_valid [] sep.1 [] [] s.1 [] (hsp ∘ ext)
+    rw [aux]; clear aux
+    apply congrArg
+    simp [List.splitOnListAux_eq_splitOnList_append_append]
 
 @[simp] theorem toString_toSubstring (s : String) : s.toSubstring.toString = s :=
   extract_zero_endPos _