Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions openml/_api/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
from openml._api.runtime.core import APIContext


def set_api_version(version: str, *, strict: bool = False) -> None:
    """Select the API version on the module-level ``api_context``.

    Parameters
    ----------
    version : str
        Version identifier, forwarded unchanged to ``api_context.set_version``.
    strict : bool, optional (default=False)
        Keyword-only flag, forwarded unchanged to ``api_context.set_version``.
    """
    forwarded = {"version": version, "strict": strict}
    api_context.set_version(**forwarded)


# Module-level APIContext instance; set_api_version() delegates to it.
api_context = APIContext()
62 changes: 62 additions & 0 deletions openml/_api/config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
from __future__ import annotations

from dataclasses import dataclass
from typing import Literal

# Allowed values for ``ConnectionConfig.delay_method``.
DelayMethod = Literal["human", "robot"]


@dataclass
class APIConfig:
    """Connection details for one API endpoint.

    Attributes
    ----------
    server : str
        Root URL of the server.
    base_url : str
        Path prefix below ``server``.
    key : str
        API key.
    timeout : int
        Timeout in seconds; defaults to 10.
    """

    server: str
    base_url: str
    key: str
    timeout: int = 10


@dataclass
class APISettings:
    """Pair of endpoint configurations, one per API version.

    Attributes
    ----------
    v1 : APIConfig
        Configuration of the v1 endpoint.
    v2 : APIConfig
        Configuration of the v2 endpoint.
    """

    v1: APIConfig
    v2: APIConfig


@dataclass
class ConnectionConfig:
    """Connection settings: retry count and delay configuration.

    Attributes
    ----------
    retries : int
        Defaults to 3.
    delay_method : DelayMethod
        Either ``"human"`` or ``"robot"``; defaults to ``"human"``.
    delay_time : int
        Delay in seconds; defaults to 1.

    Raises
    ------
    ValueError
        On construction, if ``delay_method`` is neither ``"human"`` nor ``"robot"``.
    """

    retries: int = 3
    delay_method: DelayMethod = "human"
    delay_time: int = 1

    def __post_init__(self) -> None:
        """Validate ``delay_method`` right after the generated ``__init__`` runs."""
        accepted = ("human", "robot")
        if self.delay_method in accepted:
            return
        raise ValueError(f"delay_method must be 'human' or 'robot', got {self.delay_method}")


@dataclass
class CacheConfig:
    """Cache location and lifetime.

    Attributes
    ----------
    dir : str
        Cache directory; the default starts with ``~`` and is stored unexpanded.
    ttl : int
        Time-to-live in seconds; defaults to one week.
    """

    dir: str = "~/.openml/cache"
    ttl: int = 7 * 24 * 60 * 60


@dataclass
class Settings:
    """Top-level container grouping the API, connection and cache settings.

    Attributes
    ----------
    api : APISettings
        Per-version endpoint configuration.
    connection : ConnectionConfig
        Connection settings.
    cache : CacheConfig
        Cache settings.
    """

    api: APISettings
    connection: ConnectionConfig
    cache: CacheConfig


# Module-level default settings instance, built once at import time.
# NOTE(review): the "..." keys look like placeholders, and the v2 server is a
# plain-HTTP local address — confirm both are meant to ship as defaults.
settings = Settings(
api=APISettings(
v1=APIConfig(
server="https://www.openml.org/",
base_url="api/v1/xml/",
key="...",
),
v2=APIConfig(
server="http://127.0.0.1:8001/",
base_url="",
key="...",
),
),
# Connection and cache sections use their dataclass defaults.
connection=ConnectionConfig(),
cache=CacheConfig(),
)
3 changes: 3 additions & 0 deletions openml/_api/http/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
from openml._api.http.client import HTTPClient

# Names re-exported by this package.
__all__ = ["HTTPClient"]
302 changes: 302 additions & 0 deletions openml/_api/http/client.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,302 @@
from __future__ import annotations

import contextlib
import shutil
import urllib
import urllib.parse
import zipfile
from collections.abc import Callable
from pathlib import Path
from typing import TYPE_CHECKING, Any
from urllib.parse import urlencode, urljoin, urlparse

import minio
import requests
from requests import Response
from urllib3 import ProxyManager

from openml.__version__ import __version__
from openml._api.config import settings

if TYPE_CHECKING:
from openml._api.config import APIConfig

import openml.config
from openml.utils import ProgressBar


class CacheMixin:
    """Mixin that maps request URLs onto directories below the cache root.

    Reading from and writing to the cache are placeholders: the getter returns
    an empty ``Response`` and the setter does nothing.
    """

    @property
    def dir(self) -> str:
        """Cache root directory, read from the global settings."""
        return settings.cache.dir

    @property
    def ttl(self) -> int:
        """Cache time-to-live, read from the global settings."""
        return settings.cache.ttl

    def _get_cache_dir(self, url: str, params: dict[str, Any]) -> Path:
        """Build the cache path for ``url`` and ``params``.

        The path consists of the cache root, the dot-separated host parts in
        reverse order, the URL path segments and, when any parameters other
        than ``api_key`` are present, their url-encoded form as a final
        segment.
        """
        pieces = urlparse(url)

        # Reversed domain first, then the URL path.
        segments = list(reversed(pieces.netloc.split(".")))
        segments.extend(pieces.path.strip("/").split("/"))

        # The API key must never become part of a path on disk.
        public_params = dict(params)
        public_params.pop("api_key", None)
        if public_params:
            segments.append(urlencode(public_params))

        return Path(self.dir).joinpath(*segments)

    def _get_cache_response(self, cache_dir: Path) -> Response:  # noqa: ARG002
        """Placeholder: return an empty ``Response`` regardless of ``cache_dir``."""
        return Response()

    def _set_cache_response(self, cache_dir: Path, response: Response) -> None:  # noqa: ARG002
        """Placeholder: store nothing."""
        return None


class HTTPClient(CacheMixin):
    """HTTP client bound to one ``APIConfig``.

    Every request is sent through ``requests.request`` against
    ``config.server`` + ``config.base_url`` and carries an ``openml-python``
    user-agent header.
    """

    def __init__(self, config: APIConfig) -> None:
        self.config = config
        self.headers: dict[str, str] = {"user-agent": f"openml-python/{__version__}"}

    @property
    def server(self) -> str:
        """Server root URL from the bound configuration."""
        return self.config.server

    @property
    def base_url(self) -> str:
        """Path prefix from the bound configuration."""
        return self.config.base_url

    @property
    def key(self) -> str:
        """API key from the bound configuration."""
        return self.config.key

    @property
    def timeout(self) -> int:
        """Default request timeout from the bound configuration."""
        return self.config.timeout

    def request(
        self,
        method: str,
        path: str,
        *,
        use_cache: bool = False,
        use_api_key: bool = False,
        **request_kwargs: Any,
    ) -> Response:
        """Send an HTTP request and return the response.

        Parameters
        ----------
        method : str
            HTTP method name passed to ``requests.request``.
        path : str
            Path joined onto ``server`` and ``base_url``.
        use_cache : bool, optional (default=False)
            If True, return the result of ``_get_cache_response`` and send no
            HTTP request.
        use_api_key : bool, optional (default=False)
            If True, add the configured key as the ``api_key`` query parameter.
        **request_kwargs
            Forwarded to ``requests.request``. ``params``, ``headers`` and
            ``timeout`` are extracted first; ``None`` for ``params`` or
            ``headers`` is treated as empty.
        """
        url = urljoin(self.server, urljoin(self.base_url, path))

        # Work on copies so the caller's mappings are never mutated.
        params = (request_kwargs.pop("params", None) or {}).copy()
        if use_api_key:
            params["api_key"] = self.key

        headers = (request_kwargs.pop("headers", None) or {}).copy()
        # The client's own headers win over caller-supplied ones.
        headers.update(self.headers)

        timeout = request_kwargs.pop("timeout", self.timeout)
        cache_dir = self._get_cache_dir(url, params)

        if use_cache:
            # TODO: handle ttl expired error
            # NOTE(review): this always returns, so the cache store below is
            # unreachable until a cache miss/expiry signal is defined.
            return self._get_cache_response(cache_dir)

        response = requests.request(
            method=method,
            url=url,
            params=params,
            headers=headers,
            timeout=timeout,
            **request_kwargs,
        )

        if use_cache:
            self._set_cache_response(cache_dir, response)

        return response

    def get(
        self,
        path: str,
        *,
        use_cache: bool = False,
        use_api_key: bool = False,
        **request_kwargs: Any,
    ) -> Response:
        """Send a GET request; caching is forced off until the cache exists."""
        # TODO: remove override when cache is implemented
        use_cache = False
        return self.request(
            method="GET",
            path=path,
            use_cache=use_cache,
            use_api_key=use_api_key,
            **request_kwargs,
        )

    def post(
        self,
        path: str,
        **request_kwargs: Any,
    ) -> Response:
        """Send a POST request, uncached and with the API key attached."""
        return self.request(
            method="POST",
            path=path,
            use_cache=False,
            use_api_key=True,
            **request_kwargs,
        )

    def delete(
        self,
        path: str,
        **request_kwargs: Any,
    ) -> Response:
        """Send a DELETE request, uncached and with the API key attached."""
        return self.request(
            method="DELETE",
            path=path,
            use_cache=False,
            use_api_key=True,
            **request_kwargs,
        )

    def download(
        self,
        url: str,
        handler: Callable[[Response, Path, str], Path] | None = None,
        encoding: str = "utf-8",
    ) -> Path:
        """Fetch ``url`` and persist the response below the cache directory.

        Parameters
        ----------
        url : str
            URL to fetch with :meth:`get`.
        handler : callable, optional
            Called as ``handler(response, path, encoding)`` and must return the
            path it wrote to. If None, the response text is written with
            :meth:`_text_handler`.
        encoding : str, optional (default="utf-8")
            Encoding handed to the handler.

        Returns
        -------
        Path
            Location returned by the handler.
        """
        response = self.get(url)
        dir_path = self._get_cache_dir(url, {}).expanduser()
        if handler is not None:
            return handler(response, dir_path, encoding)

        # _text_handler takes (response, path, encoding); passing ``url`` as a
        # fourth argument raised TypeError.
        return self._text_handler(response, dir_path, encoding)

    def _text_handler(self, response: Response, path: Path, encoding: str) -> Path:
        """Write ``response.text`` to ``path`` and return the file written.

        If ``path`` is an existing directory the text goes to
        ``response.txt`` inside it. Missing parent directories are created.
        """
        if path.is_dir():
            path = path / "response.txt"
        path.parent.mkdir(parents=True, exist_ok=True)
        with path.open("w", encoding=encoding) as f:
            f.write(response.text)
        return path


class MinIOClient(CacheMixin):
    """Client for downloading single files or whole prefixes from MinIO buckets."""

    def __init__(self) -> None:
        self.headers: dict[str, str] = {"user-agent": f"openml-python/{__version__}"}

    def download_minio_file(
        self,
        source: str,
        destination: str | Path | None = None,
        exists_ok: bool = True,  # noqa: FBT002
        proxy: str | None = "auto",
    ) -> str:
        """Download file ``source`` from a MinIO Bucket and store it at ``destination``.

        A downloaded ``.zip`` file is additionally extracted next to itself.

        Parameters
        ----------
        source : str
            URL to a file in a MinIO bucket.
        destination : str | Path | None
            Path to store the file to, if a directory is provided the original filename is used.
            If None, a path below the cache directory is derived from ``source``.
        exists_ok : bool, optional (default=True)
            If False, raise FileExists if a file already exists in ``destination``.
        proxy: str, optional (default = "auto")
            The proxy server to use. By default it's "auto" which uses ``requests`` to
            automatically find the proxy to use. Pass None or the environment variable
            ``no_proxy="*"`` to disable proxies.

        Returns
        -------
        str
            Path of the downloaded file.

        Raises
        ------
        FileExistsError
            If the target file exists and ``exists_ok`` is False.
        FileNotFoundError
            If the object or bucket does not exist, or the bucket is private.
        """
        destination = self._get_cache_dir(source, {}) if destination is None else Path(destination)
        # Expand "~" before any filesystem check: the default cache directory
        # starts with "~", and is_dir()/is_file() on the unexpanded path would
        # inspect a literal "~" directory relative to the working directory.
        destination = destination.expanduser()
        parsed_url = urllib.parse.urlparse(source)

        # expect path format: /BUCKET/path/to/file.ext
        bucket, object_name = parsed_url.path[1:].split("/", maxsplit=1)
        if destination.is_dir():
            destination = Path(destination, object_name)
        if destination.is_file() and not exists_ok:
            raise FileExistsError(f"File already exists in {destination}.")

        destination.parent.mkdir(parents=True, exist_ok=True)

        if proxy == "auto":
            resolved_proxies = requests.utils.get_environ_proxies(parsed_url.geturl())
            proxy = requests.utils.select_proxy(parsed_url.geturl(), resolved_proxies)  # type: ignore

        proxy_client = ProxyManager(proxy) if proxy else None

        client = minio.Minio(endpoint=parsed_url.netloc, secure=False, http_client=proxy_client)
        try:
            client.fget_object(
                bucket_name=bucket,
                object_name=object_name,
                file_path=str(destination),
                progress=ProgressBar() if openml.config.show_progress else None,
                request_headers=self.headers,
            )
            if destination.is_file() and destination.suffix == ".zip":
                with zipfile.ZipFile(destination, "r") as zip_ref:
                    zip_ref.extractall(destination.parent)

        except minio.error.S3Error as e:
            if e.message is not None and e.message.startswith("Object does not exist"):
                raise FileNotFoundError(f"Object at '{source}' does not exist.") from e
            # e.g. permission error, or a bucket does not exist (which is also interpreted as a
            # permission error on minio level).
            raise FileNotFoundError("Bucket does not exist or is private.") from e

        return str(destination)

    def download_minio_bucket(self, source: str, destination: str | Path | None = None) -> None:
        """Download the objects next to ``source`` in a MinIO bucket into ``destination``.

        Does not redownload files which already exist: for every object an empty
        marker file named after its etag is kept in ``destination``.

        Parameters
        ----------
        source : str
            URL to a MinIO bucket. The last path segment is dropped and the
            remaining segments after the bucket name are used as listing prefix.
        destination : str | Path | None
            Path to a directory to store the bucket content in. If None, a path
            below the cache directory is derived from ``source``.

        Raises
        ------
        ValueError
            If a listed object has no name or no etag.
        """
        destination = self._get_cache_dir(source, {}) if destination is None else Path(destination)
        # Expand "~" once so markers, archive checks and downloads all refer to
        # the same directory.
        destination = destination.expanduser()
        parsed_url = urllib.parse.urlparse(source)

        # expect path format: /BUCKET/path/to/file.ext
        _, bucket, *prefixes, _file = parsed_url.path.split("/")
        prefix = "/".join(prefixes)
        source_parent = source.rsplit("/", 1)[0]

        client = minio.Minio(endpoint=parsed_url.netloc, secure=False)

        for file_object in client.list_objects(bucket, prefix=prefix, recursive=True):
            if file_object.object_name is None:
                raise ValueError(f"Object name is None for object {file_object!r}")
            if file_object.etag is None:
                raise ValueError(f"Object etag is None for object {file_object!r}")

            marker = destination / file_object.etag
            if marker.exists():
                continue

            # [-1] instead of [1]: an object name without "/" is its own file name.
            file_name = file_object.object_name.rsplit("/", 1)[-1]
            file_destination = destination / file_name
            extracted_dir = file_destination.parent / file_destination.stem
            # is_dir() rather than exists(): for a file without suffix the stem
            # equals the file itself, and rmtree would fail on a regular file.
            if extracted_dir.is_dir():
                # Marker is missing but archive exists means the server archive changed
                # force a refresh
                shutil.rmtree(extracted_dir)

            with contextlib.suppress(FileExistsError):
                self.download_minio_file(
                    source=f"{source_parent}/{file_name}",
                    destination=file_destination,
                    exists_ok=False,
                )

            # The archive was extracted by download_minio_file; drop the zip itself.
            if file_destination.is_file() and file_destination.suffix == ".zip":
                file_destination.unlink()
            marker.touch()
Empty file added openml/_api/http/utils.py
Empty file.
4 changes: 4 additions & 0 deletions openml/_api/resources/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
from openml._api.resources.datasets import DatasetsV1, DatasetsV2
from openml._api.resources.tasks import TasksV1, TasksV2

# Names re-exported by this package.
__all__ = ["DatasetsV1", "DatasetsV2", "TasksV1", "TasksV2"]
Loading
Loading