Skip to content

Commit

Permalink
feat: gitpathspec module for handling Git's pathspecs
Browse files Browse the repository at this point in the history
Besides the basic type provided here, the key feature is the translation
of pathspecs into the scope of a subdirectory. This is the foundational
support for implementations that focus on submodule-recursion combined
with pathspecs. Git does not generally provide this support in its
commands.
  • Loading branch information
mih committed Oct 29, 2024
1 parent 40543a4 commit 744a925
Show file tree
Hide file tree
Showing 6 changed files with 934 additions and 1 deletion.
17 changes: 17 additions & 0 deletions datasalad/gitpathspec/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
"""Handling of Git's pathspecs with subdirectory mangling support
This functionality can be used to add support for pathspecs to implementations
that rely on Git commands that do not support submodule recursion directly.
.. currentmodule:: datasalad.gitpathspec
.. autosummary::
:toctree: generated
GitPathSpec
GitPathSpecs
"""

__all__ = ['GitPathSpec', 'GitPathSpecs']

from .pathspec import GitPathSpec
from .pathspecs import GitPathSpecs
327 changes: 327 additions & 0 deletions datasalad/gitpathspec/pathspec.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,327 @@
#
# Intentionally written without importing datalad code
#
from __future__ import annotations

import posixpath
from dataclasses import dataclass
from fnmatch import fnmatch
from typing import Generator


@dataclass(frozen=True)
class GitPathSpec:
"""Support class for patterns used to limit paths in Git commands
From the Git documentation:
Pathspecs are used on the command line of "git ls-files", "git ls-tree",
"git add", "git grep", "git diff", "git checkout", and many other
commands to limit the scope of operations to some subset of the tree
or working tree.
Apart from providing a dedicated type for a pathspec, the main purpose
of this functionality is to take a pathspec that is valid in the context
of one (top-level) repository, and translate it such that the set of
pathspecs given to the same command running on/in a submodule/subdirectory
gives the same results, as if the initial top-level invocation reported
them (if it even could). See the ``for_subdir()`` method for more.
>>> # simple stripping of leading directory
>>> ps = GitPathSpec.from_pathspec_str('dir/*.jpg')
>>> [str(i) for i in ps.for_subdir('dir')]
['*.jpg']
>>> # match against magic pathspecs
>>> ps = GitPathSpec.from_pathspec_str(':(glob)**r/*.jpg')
>>> # longest and shortest match are produced
>>> [str(i) for i in ps.for_subdir('dir')]
[':(glob)**r/*.jpg', ':(glob)*.jpg']
>>> [str(i) for i in ps.for_subdir('root/some/dir')]
[':(glob)**r/*.jpg', ':(glob)*.jpg']
>>> # support for special 'no-pathspec' pathspec
>>> ps = GitPathSpec.from_pathspec_str(':')
>>> ps.is_nopathspecs
True
.. seealso::
- Entry in the Git glossary:
https://git-scm.com/docs/gitglossary#Documentation/gitglossary.txt-aiddefpathspecapathspec
- Informative, more elaborate description of pathspecs:
https://css-tricks.com/git-pathspecs-and-how-to-use-them/
"""

# TODO: think about adding support for another magic that represents
# the root of a repository hierarchy (amending 'top', which is
# the root of the working tree -- but presumably for a single repository
spectypes: tuple[str, ...]
"""Long-form pathspec type identifiers"""
dirprefix: str | None
"""Directory prefix (pathspec up to the last slash) limiting the scope"""
pattern: str | None
"""Pattern to match paths against using ``fnmatch``"""

@property
def is_nopathspecs(self) -> bool:
"""Whether this pathspec is the "no pathspecs" pathspec, AKA ``':'``"""
return not self.spectypes and not self.dirprefix and not self.pattern

def __str__(self) -> str:
"""Generate normalized (long-form) pathspec"""
if self.is_nopathspecs:
return ':'
ps = ''
if self.spectypes:
ps += ':('
ps += ','.join(self.spectypes)
ps += ')'
ps += self.get_joined_pattern()
return ps

def get_joined_pattern(self):
return (
f'{self.dirprefix if self.dirprefix else ""}'
f'{"/" if self.dirprefix else ""}'
f'{self.pattern if self.pattern else ""}'
)

def for_subdir(self, subdir: str) -> list[GitPathSpec]:
"""Translate a pathspec into the scope of a subdirectory.
The processing implemented here is purely lexical. This means that it
works without matching against actual file system (or Git tree)
content. Consequently, to some degree, overly broad results are
produced, but at the same time use cases are supported where there
is nothing (yet) to match against (e.g., a not-yet-cloned submodule).
A pathspec with a ``top`` magic is produced unmodified, as there are
defined relative to the root of a repository, not relative to a base
directory. As a consequence, such pathspecs will automatically
refer to a submodule root when the target directory is contained in
one.
Parameters
----------
subdir: str
Relative path in POSIX notation
Returns
-------
list
When an empty list is returned, this indicates that the pathsspec
cannot be translated to the given ``subdir``, because it does
not match the ``subdir`` itself. If a pathspec translates to
"no pathspecs" (``':'``), a list with a dedicated ':' pathspec is
returned.
"""
# special case of a non-translation (pretty much only here to
# make some test implementations simpler
if not subdir:
return [self]

return list(yield_subdir_match_remainder_pathspecs(subdir, self))

@classmethod
def from_pathspec_str(
cls,
pathspec: str,
) -> GitPathSpec:
"""Parse a string-form pathspec into types, prefix, and pattern"""
spectypes: list[str] = []
dirprefix = None
pattern = None

if pathspec == ':':
# shortcut for the special no-path-spec pathspec
return GitPathSpec((), '', None)

if pathspec.startswith(':('):
# long-form magic
magic, pattern = pathspec[2:].split(')', maxsplit=1)
spectypes = magic.split(',')
elif pathspec.startswith(':'):
# short-form magic
spectypes, pattern = _pathspec_from_shortform(pathspec)
else:
pattern = pathspec

# raise when glob and literal magic markers are present
# simultaneously
if 'glob' in spectypes and 'literal' in spectypes:
msg = "'glob' magic is incompatible with 'literal' magic"
raise ValueError(msg)

# split off dirprefix
dirprefix, pattern = _split_prefix_pattern(pattern)

return cls(
spectypes=tuple(spectypes),
dirprefix=dirprefix,
pattern=pattern,
)


def _pathspec_from_shortform(spec: str) -> tuple[list[str], str]:
# short-form magic
magic_signatures = {
'/': 'top',
'!': 'exclude',
'^': 'exclude',
':': None,
}
pattern = spec[1:]
spectypes: list[str] = []
for i in range(1, len(spec)):
sig = magic_signatures.get(spec[i])
if sig is None:
return (spectypes, spec[i:])
spectypes.append(sig)
return (spectypes, pattern)


def _split_prefix_pattern(pathspec: str) -> tuple[str | None, str | None]:
# > the pathspec up to the last slash represents a directory prefix.
# > The scope of that pathspec is limited to that subtree.
pattern: str | None = None
try:
last_slash_idx = pathspec[::-1].index('/')
except ValueError:
# everything is the pattern
dirprefix = None
pattern = pathspec
else:
dirprefix = pathspec[: -last_slash_idx - 1]
pattern = pathspec[-last_slash_idx:] if last_slash_idx > 0 else None
return dirprefix, pattern


def yield_subdir_match_remainder_pathspecs(
subdir: str,
pathspec: GitPathSpec,
) -> Generator[GitPathSpec, None, None]:
"""Translate a pathspec into a set of possible subdirectory pathspecs
The processing implemented here is purely lexical. This means that it
works without matching against actual file system (or Git tree) content.
This means that it yields, to some degree, overly broad results, but also
that it works in cases where there is nothing (yet) to match against.
For example, a not-yet-cloned submodule.
This function does not perform any validatity checking of pathspecs. Only
valid pathspecs and well-formed paths are supported.
A pathspec with the ``top`` magic is returned immediately and as-is. These
pathspecs have an absolute reference and do not require a translation into
a subdirectory namespace.
Parameters
----------
subdir: str
POSIX-notation relative path of a subdirectory. The reference directory
match be the same as that of the pathspec to be translated.
pathspec: GitPathSpec
To-be-translated pathspec
Yields
------
GitPathSpec
Any number of pathspecs that an input pathspec decomposed into upon
translation into the namespace of a subdirectory.
"""
if 'top' in pathspec.spectypes or pathspec.is_nopathspecs:
# pathspec with an absolute reference, or "no pathspecs"
# no translation needed
yield pathspec
return

# add a trailing directory separator to prevent undesired
# matches of partial directory names
subdir = subdir if subdir.endswith('/') else f'{subdir}/'
tp = pathspec.get_joined_pattern()

if 'icase' in pathspec.spectypes:
subdir = subdir.casefold()
tp = tp.casefold()

# literal pathspecs
if 'literal' in pathspec.spectypes:
# append a trailing slash to allow for full matches
tp_endslash = f'{tp}/'
if not tp_endslash.startswith(subdir):
# no match
# BUT
# we might have a multi-level subdir, and we might match an
# intermediate subdir and could still yield a 'no pathspec'
# result
while subdir := posixpath.split(subdir)[0]:
if tp_endslash.startswith(subdir):
yield GitPathSpec.from_pathspec_str(':')
return
return

remainder = tp[len(subdir) :]
if not remainder:
# full match
yield GitPathSpec.from_pathspec_str(':')
else:
yield GitPathSpec(pathspec.spectypes, *_split_prefix_pattern(remainder))
return

# tokenize the testpattern using the wildcard that also matches
# directories
token_delim = '**' if 'glob' in pathspec.spectypes else '*'
tp_chunks = tp.split(token_delim)
prefix_match = ''
yielded = set()
for i, chunk in enumerate(tp_chunks):
last_chunk = i + 1 == len(tp_chunks)
if last_chunk:
trymatch = f'{prefix_match}{chunk}{"" if chunk.endswith("/") else "/"}'
else:
trymatch = f'{prefix_match}{chunk}*'
if not fnmatch(subdir, f'{trymatch}'):
# each chunk needs match in order, first non-match ends the
# algorithm
# BUT
# we have an (initial) chunk that points already
# inside the target subdir
submatch = trymatch
while submatch := posixpath.split(submatch)[0]:
if not fnmatch(f'{subdir}', f'{submatch}/'):
continue
ps = GitPathSpec(
pathspec.spectypes,
*_split_prefix_pattern(
# +1 for trailing slash
tp[len(submatch) + 1 :]
),
)
if ps not in yielded:
yield ps
return
# OR
# we might have a multi-level subdir, and we might match an
# intermediate subdir and could still yield a 'no pathspec'
# result
while subdir := posixpath.split(subdir)[0]:
if fnmatch(f'{subdir}/', trymatch):
yield GitPathSpec.from_pathspec_str(':')
return
return

remainder = tp_chunks[i + 1 :]
if all(not c for c in remainder):
# direct hit, no pathspecs after translation
yield GitPathSpec.from_pathspec_str(':')
return
else:
ps = GitPathSpec(
pathspec.spectypes,
*_split_prefix_pattern(
f'{token_delim}{token_delim.join(remainder)}',
),
)
yield ps
yielded.add(ps)
# extend prefix for the next round
prefix_match = trymatch
Loading

0 comments on commit 744a925

Please sign in to comment.