Skip to content

Commit

Permalink
apply suggestion from PR review: simplify normalization func
Browse files: browse the repository at this point in the history
Co-authored-by: Helio Machado <[email protected]>
  • Loading branch information
shcheklein and 0x2b3bfa0 authored Oct 28, 2024
1 parent cd76ebe commit 7219876
Showing 1 changed file with 12 additions and 17 deletions.
29 changes: 12 additions & 17 deletions src/datachain/lib/utils.py
Original file line number Diff line number Diff line change
@@ -42,24 +42,19 @@ def normalize_col_names(col_names: Sequence[str]) -> dict[str, str]:
new_column = re.sub("[^0-9a-z]+", "_", new_column)
new_column = new_column.strip("_")

if (
not new_column
or new_column[0].isdigit()
or (new_column != org_column and new_column in org_col_names)
or new_column in new_col_names
generated_column = new_column

while (
not generated_column.isidentifier()
or generated_column in new_col_names
or (generated_column != org_column and generated_column in org_col_names)
):
while True:
if new_column:
generated_column = f"c{gen_col_counter}_{new_column}"
else:
generated_column = f"c{gen_col_counter}"
gen_col_counter += 1
if new_column:
generated_column = f"{generated_column}_{new_column}"
if (
generated_column not in org_col_names
and generated_column not in new_col_names
):
new_column = generated_column
break

new_col_names[new_column] = org_column
gen_col_counter += 1

new_col_names[generated_column] = org_column

return new_col_names

0 comments on commit 7219876

Please sign in to comment.