-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat:
gitpathspec
module for handling Git's pathspecs
Besides the basic type provided here, the key feature is the translation of pathspecs into the scope of a subdirectory. This is the foundational support for implementations that focus on submodule-recursion combined with pathspecs. Git does not generally provide this support in its commands.
- Loading branch information
Showing
6 changed files
with
928 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
"""Handling of Git's pathspecs with subdirectory mangling support | ||
This functionality can be used to add support for pathspecs to implementations | ||
that rely on Git commands that do not support submodule recursion directly. | ||
.. currentmodule:: datasalad.gitpathspec | ||
.. autosummary:: | ||
:toctree: generated | ||
GitPathSpec | ||
GitPathSpecs | ||
""" | ||
|
||
__all__ = ['GitPathSpec', 'GitPathSpecs'] | ||
|
||
from .pathspec import GitPathSpec | ||
from .pathspecs import GitPathSpecs |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,321 @@ | ||
# | ||
# Intentionally written without importing datalad code | ||
# | ||
from __future__ import annotations | ||
|
||
import posixpath | ||
from dataclasses import dataclass | ||
from fnmatch import fnmatch | ||
from typing import Generator | ||
|
||
|
||
@dataclass(frozen=True) | ||
class GitPathSpec: | ||
"""Support class for patterns used to limit paths in Git commands | ||
From the Git documentation: | ||
Pathspecs are used on the command line of "git ls-files", "git ls-tree", | ||
"git add", "git grep", "git diff", "git checkout", and many other | ||
commands to limit the scope of operations to some subset of the tree | ||
or working tree. | ||
Apart from providing a dedicated type for a pathspec, the main purpose | ||
of this functionality is to take a pathspec that is valid in the context | ||
of one (top-level) repository, and translate it such that the set of | ||
pathspecs given to the same command running on/in a submodule/subdirectory | ||
gives the same results, as if the initial top-level invocation reported | ||
them (if it even could). See the ``for_subdir()`` method for more. | ||
>>> # simple stripping of leading directory | ||
>>> ps = GitPathSpec.from_pathspec_str('dir/*.jpg') | ||
>>> [str(i) for i in ps.for_subdir('dir')] | ||
['*.jpg'] | ||
>>> # match against magic pathspecs | ||
>>> ps = GitPathSpec.from_pathspec_str(':(glob)**r/*.jpg') | ||
>>> # longest and shortest match are produced | ||
>>> [str(i) for i in ps.for_subdir('dir')] | ||
[':(glob)**r/*.jpg', ':(glob)*.jpg'] | ||
>>> [str(i) for i in ps.for_subdir('root/some/dir')] | ||
[':(glob)**r/*.jpg', ':(glob)*.jpg'] | ||
>>> # support for special 'no-pathspec' pathspec | ||
>>> ps = GitPathSpec.from_pathspec_str(':') | ||
>>> ps.is_nopathspecs | ||
True | ||
.. seealso:: | ||
- Entry in the Git glossary: | ||
https://git-scm.com/docs/gitglossary#Documentation/gitglossary.txt-aiddefpathspecapathspec | ||
- Informative, more elaborate description of pathspecs: | ||
https://css-tricks.com/git-pathspecs-and-how-to-use-them/ | ||
""" | ||
|
||
# TODO: think about adding support for another magic that represents | ||
# the root of a repository hierarchy (amending 'top', which is | ||
# the root of the working tree -- but presumably for a single repository | ||
spectypes: tuple[str, ...] | ||
"""Long-form pathspec type identifiers""" | ||
dirprefix: str | ||
"""Directory prefix (pathspec up to the last slash) limiting the scope""" | ||
pattern: str | None | ||
"""Pattern to match paths against using ``fnmatch``""" | ||
|
||
@property | ||
def is_nopathspecs(self) -> bool: | ||
"""Whether this pathspec is the "no pathspecs" pathspec, AKA ``':'``""" | ||
return not self.spectypes and not self.dirprefix and not self.pattern | ||
|
||
def __str__(self) -> str: | ||
"""Generate normalized (long-form) pathspec""" | ||
if self.is_nopathspecs: | ||
return ':' | ||
ps = '' | ||
if self.spectypes: | ||
ps += ':(' | ||
ps += ','.join(self.spectypes) | ||
ps += ')' | ||
ps += self.get_joined_pattern() | ||
return ps | ||
|
||
def get_joined_pattern(self): | ||
return ( | ||
f'{self.dirprefix if self.dirprefix else ""}' | ||
f'{"/" if self.dirprefix else ""}' | ||
f'{self.pattern if self.pattern else ""}' | ||
) | ||
|
||
def for_subdir(self, subdir: str) -> list[GitPathSpec]: | ||
"""Translate a pathspec into the scope of a subdirectory. | ||
The processing implemented here is purely lexical. This means that it | ||
works without matching against actual file system (or Git tree) | ||
content. Consequently, to some degree, overly broad results are | ||
produced, but at the same time use cases are supported where there | ||
is nothing (yet) to match against (e.g., a not-yet-cloned submodule). | ||
A pathspec with a ``top`` magic is produced unmodified, as there are | ||
defined relative to the root of a repository, not relative to a base | ||
directory. As a consequence, such pathspecs will automatically | ||
refer to a submodule root when the target directory is contained in | ||
one. | ||
Parameters | ||
---------- | ||
subdir: str | ||
Relative path in POSIX notation | ||
Returns | ||
------- | ||
list | ||
When an empty list is returned, this indicates that the pathsspec | ||
cannot be translated to the given ``subdir``, because it does | ||
not match the ``subdir`` itself. If a pathspec translates to | ||
"no pathspecs" (``':'``), a list with a dedicated ':' pathspec is | ||
returned. | ||
""" | ||
# special case of a non-translation (pretty much only here to | ||
# make some test implementations simpler | ||
if not subdir: | ||
return [self] | ||
|
||
return list(yield_subdir_match_remainder_pathspecs(subdir, self)) | ||
|
||
@classmethod | ||
def from_pathspec_str( | ||
cls, | ||
pathspec: str, | ||
) -> GitPathSpec: | ||
"""Parse a string-form pathspec into types, prefix, and pattern""" | ||
spectypes = [] | ||
dirprefix = None | ||
pattern = None | ||
|
||
if pathspec == ':': | ||
# shortcut for the special no-path-spec pathspec | ||
return GitPathSpec((), '', None) | ||
|
||
if pathspec.startswith(':('): | ||
# long-form magic | ||
magic, pattern = pathspec[2:].split(')', maxsplit=1) | ||
spectypes = magic.split(',') | ||
elif pathspec.startswith(':'): | ||
# short-form magic | ||
magic_signatures = { | ||
'/': 'top', | ||
'!': 'exclude', | ||
'^': 'exclude', | ||
':': None, | ||
} | ||
pattern = pathspec[1:] | ||
spectypes = [] | ||
for i in range(1, len(pathspec)): | ||
sig = magic_signatures.get(pathspec[i]) | ||
if sig is None: | ||
pattern = pathspec[i:] | ||
break | ||
spectypes.append(sig) | ||
else: | ||
pattern = pathspec | ||
|
||
# raise when glob and literal magic markers are present | ||
# simultaneously | ||
if 'glob' in spectypes and 'literal' in spectypes: | ||
msg = "'glob' magic is incompatible with 'literal' magic" | ||
raise ValueError(msg) | ||
|
||
# split off dirprefix | ||
dirprefix, pattern = _split_prefix_pattern(pattern) | ||
|
||
return cls( | ||
spectypes=tuple(spectypes), | ||
dirprefix=dirprefix, | ||
pattern=pattern, | ||
) | ||
|
||
|
||
def _split_prefix_pattern(pathspec): | ||
# > the pathspec up to the last slash represents a directory prefix. | ||
# > The scope of that pathspec is limited to that subtree. | ||
try: | ||
last_slash_idx = pathspec[::-1].index('/') | ||
except ValueError: | ||
# everything is the pattern | ||
dirprefix = None | ||
pattern = pathspec | ||
else: | ||
dirprefix = pathspec[: -last_slash_idx - 1] | ||
pattern = pathspec[-last_slash_idx:] if last_slash_idx > 0 else None | ||
return dirprefix, pattern | ||
|
||
|
||
def yield_subdir_match_remainder_pathspecs( | ||
subdir: str, | ||
pathspec: GitPathSpec, | ||
) -> Generator[GitPathSpec, None, None]: | ||
"""Translate a pathspec into a set of possible subdirectory pathspecs | ||
The processing implemented here is purely lexical. This means that it | ||
works without matching against actual file system (or Git tree) content. | ||
This means that it yields, to some degree, overly broad results, but also | ||
that it works in cases where there is nothing (yet) to match against. | ||
For example, a not-yet-cloned submodule. | ||
This function does not perform any validatity checking of pathspecs. Only | ||
valid pathspecs and well-formed paths are supported. | ||
A pathspec with the ``top`` magic is returned immediately and as-is. These | ||
pathspecs have an absolute reference and do not require a translation into | ||
a subdirectory namespace. | ||
Parameters | ||
---------- | ||
subdir: str | ||
POSIX-notation relative path of a subdirectory. The reference directory | ||
match be the same as that of the pathspec to be translated. | ||
pathspec: GitPathSpec | ||
To-be-translated pathspec | ||
Yields | ||
------ | ||
GitPathSpec | ||
Any number of pathspecs that an input pathspec decomposed into upon | ||
translation into the namespace of a subdirectory. | ||
""" | ||
if 'top' in pathspec.spectypes or pathspec.is_nopathspecs: | ||
# pathspec with an absolute reference, or "no pathspecs" | ||
# no translation needed | ||
yield pathspec | ||
return | ||
|
||
# add a trailing directory separator to prevent undesired | ||
# matches of partial directory names | ||
subdir = subdir if subdir.endswith('/') else f'{subdir}/' | ||
tp = pathspec.get_joined_pattern() | ||
|
||
if 'icase' in pathspec.spectypes: | ||
subdir = subdir.casefold() | ||
tp = tp.casefold() | ||
|
||
# literal pathspecs | ||
if 'literal' in pathspec.spectypes: | ||
# append a trailing slash to allow for full matches | ||
tp_endslash = f'{tp}/' | ||
if not tp_endslash.startswith(subdir): | ||
# no match | ||
# BUT | ||
# we might have a multi-level subdir, and we might match an | ||
# intermediate subdir and could still yield a 'no pathspec' | ||
# result | ||
while subdir := posixpath.split(subdir)[0]: | ||
if tp_endslash.startswith(subdir): | ||
yield GitPathSpec.from_pathspec_str(':') | ||
return | ||
return | ||
|
||
remainder = tp[len(subdir) :] | ||
if not remainder: | ||
# full match | ||
yield GitPathSpec.from_pathspec_str(':') | ||
else: | ||
yield GitPathSpec(pathspec.spectypes, *_split_prefix_pattern(remainder)) | ||
return | ||
|
||
# tokenize the testpattern using the wildcard that also matches | ||
# directories | ||
token_delim = '**' if 'glob' in pathspec.spectypes else '*' | ||
tp_chunks = tp.split(token_delim) | ||
prefix_match = '' | ||
yielded = set() | ||
for i, chunk in enumerate(tp_chunks): | ||
last_chunk = i + 1 == len(tp_chunks) | ||
if last_chunk: | ||
trymatch = f'{prefix_match}{chunk}{"" if chunk.endswith("/") else "/"}' | ||
else: | ||
trymatch = f'{prefix_match}{chunk}*' | ||
if not fnmatch(subdir, f'{trymatch}'): | ||
# each chunk needs match in order, first non-match ends the | ||
# algorithm | ||
# BUT | ||
# we have an (initial) chunk that points already | ||
# inside the target subdir | ||
submatch = trymatch | ||
while submatch := posixpath.split(submatch)[0]: | ||
if not fnmatch(f'{subdir}', f'{submatch}/'): | ||
continue | ||
ps = GitPathSpec( | ||
pathspec.spectypes, | ||
*_split_prefix_pattern( | ||
# +1 for trailing slash | ||
tp[len(submatch) + 1 :] | ||
), | ||
) | ||
if ps not in yielded: | ||
yield ps | ||
return | ||
# OR | ||
# we might have a multi-level subdir, and we might match an | ||
# intermediate subdir and could still yield a 'no pathspec' | ||
# result | ||
while subdir := posixpath.split(subdir)[0]: | ||
if fnmatch(f'{subdir}/', trymatch): | ||
yield GitPathSpec.from_pathspec_str(':') | ||
return | ||
return | ||
|
||
remainder = tp_chunks[i + 1 :] | ||
if all(not c for c in remainder): | ||
# direct hit, no pathspecs after translation | ||
yield GitPathSpec.from_pathspec_str(':') | ||
return | ||
else: | ||
ps = GitPathSpec( | ||
pathspec.spectypes, | ||
*_split_prefix_pattern( | ||
f'{token_delim}{token_delim.join(remainder)}', | ||
), | ||
) | ||
yield ps | ||
yielded.add(ps) | ||
# extend prefix for the next round | ||
prefix_match = trymatch |
Oops, something went wrong.