Adding an appropriate error when inputting URLs in the add_files function (instead of using add_external_files) #1333

Status: Open. Wants to merge 1 commit into base: master.

Changed file: clearml/datasets/dataset.py (113 changes: 65 additions, 48 deletions)
@@ -380,58 +380,75 @@ def add_tags(self, tags):
         :param tags: A list of tags which describe the Task to add.
         """
         self._task.add_tags(tags)
+    import re
Review comment (Member), on the added `import re` line: Please move to the top of the file, and make sure to sort the imports.


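A minimal sketch of what the suggested cleanup might look like, assuming `is_url` is meant to be a module-level helper (which would also let the bare `is_url(path)` call inside `add_files` resolve). Matching https:// and casting `path` to str are assumptions beyond the original patch, not part of the PR:

# Top of clearml/datasets/dataset.py (sketch; the file's other imports are omitted)
import re

# Assumption: also match https://, and accept pathlib.Path inputs via str()
_URL_SCHEME_REGEX = re.compile(r"^(?:https?|ftp|s3|gs|azure)://")


def is_url(path):
    """Return True if the given path looks like an external URL (e.g., s3://, http://)."""
    return _URL_SCHEME_REGEX.match(str(path)) is not None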
-    def add_files(
-            self,
-            path,  # type: Union[str, Path, _Path]
-            wildcard=None,  # type: Optional[Union[str, Sequence[str]]]
-            local_base_folder=None,  # type: Optional[str]
-            dataset_path=None,  # type: Optional[str]
-            recursive=True,  # type: bool
-            verbose=False,  # type: bool
-            max_workers=None,  # type: Optional[int]
-    ):
-        # type: (...) -> ()
-        """
-        Add a folder into the current dataset. calculate file hash,
-        and compare against parent, mark files to be uploaded
-
-        :param path: Add a folder/file to the dataset
-        :param wildcard: add only specific set of files.
-            Wildcard matching, can be a single string or a list of wildcards.
-        :param local_base_folder: files will be located based on their relative path from local_base_folder
-        :param dataset_path: where in the dataset the folder/files should be located
-        :param recursive: If True, match all wildcard files recursively
-        :param verbose: If True, print to console files added/modified
-        :param max_workers: The number of threads to add the files with. Defaults to the number of logical cores
-        :return: number of files added
-        """
-        max_workers = max_workers or psutil.cpu_count()
-        self._dirty = True
-        self._task.get_logger().report_text(
-            'Adding files to dataset: {}'.format(
-                dict(path=path, wildcard=wildcard, local_base_folder=local_base_folder,
-                     dataset_path=dataset_path, recursive=recursive, verbose=verbose)),
-            print_console=False)
+    def is_url(path):
+        """
+        Helper function to check if the provided path is an external URL (e.g., s3://, http://).
+        """
+        url_regex = re.compile(
+            r'^(?:http|ftp|s3|gs|azure)://'  # schemes: http, ftp, s3, gs, azure
+        )
+        return url_regex.match(path) is not None

-        num_added, num_modified = self._add_files(
-            path=path,
-            wildcard=wildcard,
-            local_base_folder=local_base_folder,
-            dataset_path=dataset_path,
-            recursive=recursive,
-            verbose=verbose,
-            max_workers=max_workers,
+    def add_files(
+            self,
+            path,  # type: Union[str, Path, _Path]
+            wildcard=None,  # type: Optional[Union[str, Sequence[str]]]
+            local_base_folder=None,  # type: Optional[str]
+            dataset_path=None,  # type: Optional[str]
+            recursive=True,  # type: bool
+            verbose=False,  # type: bool
+            max_workers=None,  # type: Optional[int]
+    ):
+        # type: (...) -> ()
+        """
+        Add a folder into the current dataset. calculate file hash,
+        and compare against parent, mark files to be uploaded
+
+        :param path: Add a folder/file to the dataset
+        :param wildcard: add only specific set of files.
+            Wildcard matching, can be a single string or a list of wildcards.
+        :param local_base_folder: files will be located based on their relative path from local_base_folder
+        :param dataset_path: where in the dataset the folder/files should be located
+        :param recursive: If True, match all wildcard files recursively
+        :param verbose: If True, print to console files added/modified
+        :param max_workers: The number of threads to add the files with. Defaults to the number of logical cores
+        :return: number of files added
+        """
+        # Check if the path provided is a URL, if so, raise an error and suggest using add_external_files
+        if is_url(path):
+            raise ValueError(
+                "The path provided seems to be an external URL (e.g., s3://, http://). "
+                "Please use `add_external_files()` to add external files to the dataset."
+            )

-        # update the task script
-        self._add_script_call(
-            'add_files', path=path, wildcard=wildcard, local_base_folder=local_base_folder,
-            dataset_path=dataset_path, recursive=recursive)
-
-        self._serialize()
-
-        return num_added
+        max_workers = max_workers or psutil.cpu_count()
+        self._dirty = True
+        self._task.get_logger().report_text(
+            'Adding files to dataset: {}'.format(
+                dict(path=path, wildcard=wildcard, local_base_folder=local_base_folder,
+                     dataset_path=dataset_path, recursive=recursive, verbose=verbose)),
+            print_console=False)
+
+        num_added, num_modified = self._add_files(
+            path=path,
+            wildcard=wildcard,
+            local_base_folder=local_base_folder,
+            dataset_path=dataset_path,
+            recursive=recursive,
+            verbose=verbose,
+            max_workers=max_workers,
+        )
+
+        # update the task script
+        self._add_script_call(
+            'add_files', path=path, wildcard=wildcard, local_base_folder=local_base_folder,
+            dataset_path=dataset_path, recursive=recursive)
+
+        self._serialize()
+
+        return num_added

     def add_external_files(
             self,
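
To make the intended behavior concrete, here is a hedged usage sketch of the guard this patch adds. The dataset name, project, and paths are illustrative, not taken from the PR:

from clearml import Dataset

dataset = Dataset.create(dataset_name="demo_dataset", dataset_project="examples")

# Local files and folders are added as before
dataset.add_files(path="./data/images")

# With this patch, a URL fails fast instead of being treated as a local path
try:
    dataset.add_files(path="s3://my-bucket/images")
except ValueError as err:
    print(err)  # message points the caller at add_external_files()
    dataset.add_external_files(source_url="s3://my-bucket/images")

The design choice here is fail-fast: previously a URL passed to add_files() would fall through to local filesystem matching and silently add nothing, whereas the explicit ValueError names the correct API for remote objects.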