Skip to content

API Reference

DataSluice — one Python interface for open-data discovery, extraction, format normalization, and pipeline integration.

AdapterError

Bases: DataSluiceError

Raised when an adapter cannot fulfil a request.

Source code in src/datasluice/exceptions.py
class AdapterError(DataSluiceError):
    """Raised when an adapter cannot fulfil a request.

    Derives from :class:`DataSluiceError`, so a handler for the base
    exception also catches it.
    """

AdapterNotFoundError

Bases: AdapterError

Raised when no adapter is registered for a portal type.

Source code in src/datasluice/exceptions.py
class AdapterNotFoundError(AdapterError):
    """Raised when no adapter is registered for a portal type.

    Derives from :class:`AdapterError`, so a handler for that class also
    catches it.
    """

AuthenticationError

Bases: DataSluiceError

Raised when authentication credentials are missing or invalid.

Source code in src/datasluice/exceptions.py
class AuthenticationError(DataSluiceError):
    """Raised when authentication credentials are missing or invalid.

    Derives from :class:`DataSluiceError`, so a handler for the base
    exception also catches it.
    """

ChecksumMismatchError

Bases: DownloadError

Raised when a downloaded file's checksum does not match.

Source code in src/datasluice/exceptions.py
class ChecksumMismatchError(DownloadError):
    """Raised when a downloaded file's checksum does not match.

    Attributes:
        expected: The ``expected`` value given to the constructor, or ``None``.
        actual: The ``actual`` value given to the constructor, or ``None``.
    """

    def __init__(self, message: str, expected: str | None = None, actual: str | None = None) -> None:
        """Pass ``message`` to the base class and record both checksum values."""
        super().__init__(message)
        self.expected, self.actual = expected, actual

ConfigError

Bases: DataSluiceError

Raised when configuration is invalid or incomplete.

Source code in src/datasluice/exceptions.py
class ConfigError(DataSluiceError):
    """Raised when configuration is invalid or incomplete.

    Derives from :class:`DataSluiceError`, so a handler for the base
    exception also catches it.
    """

DataSluice

Unified client for open-data portals.

Parameters:

Name Type Description Default
portal_url str

Base URL of the open-data portal.

required
portal_type str | None

Optional explicit portal type (e.g. "ckan"). Auto-detected when omitted.

None
auth BaseAuth | None

Optional authentication strategy.

None
settings Settings | None

Optional pre-loaded settings. Loaded from the environment when omitted.

None
transport HttpClient | None

Optional pre-configured HTTP client.

None
Example

>>> from datasluice import DataSluice
>>> ds = DataSluice("https://catalog.data.gov")
>>> results = ds.search("climate change")
>>> for dataset in results:
...     print(dataset.title)

Source code in src/datasluice/client.py
class DataSluice:
    """Unified client for open-data portals.

    Args:
        portal_url: Base URL of the open-data portal.
        portal_type: Optional explicit portal type (e.g. ``"ckan"``).
            Auto-detected when omitted.
        auth: Optional authentication strategy.
        settings: Optional pre-loaded settings. Loaded from the environment
            when omitted.
        transport: Optional pre-configured HTTP client.

    Example:
        >>> from datasluice import DataSluice
        >>> ds = DataSluice("https://catalog.data.gov")
        >>> results = ds.search("climate change")
        >>> for dataset in results:
        ...     print(dataset.title)
    """

    def __init__(
        self,
        portal_url: str,
        *,
        portal_type: str | None = None,
        auth: BaseAuth | None = None,
        settings: Settings | None = None,
        transport: HttpClient | None = None,
    ) -> None:
        """Load settings, configure logging, and wire transport and adapter."""
        self.settings = settings or load_settings()
        configure_logging(self.settings.log_level)

        # _build_transport() reads self.settings and self.auth, so both must
        # be assigned before the transport is constructed.
        self.auth = auth
        self._transport = transport or self._build_transport()
        self.adapter = create_adapter(portal_url, portal_type=portal_type, auth=auth)
        # The adapter and this client share a single HTTP client instance.
        self.adapter._transport = self._transport

        # Created on first access of the ``downloader`` property.
        self._downloader: Downloader | None = None
        logger.debug("Initialised DataSluice for %s (%s)", portal_url, self.adapter.portal_type)

    def _build_transport(self) -> HttpClient:
        """Construct the HTTP client from settings.

        No rate limiter is installed when ``settings.rate_limit`` is falsy.
        """
        rate_limiter = RateLimiter(requests_per_second=self.settings.rate_limit) if self.settings.rate_limit else None
        retry_policy = RetryPolicy(max_attempts=self.settings.http_retries)
        return HttpClient(
            auth=self.auth,
            timeout=self.settings.http_timeout,
            retry_policy=retry_policy,
            rate_limiter=rate_limiter,
            user_agent=self.settings.user_agent,
        )

    @property
    def downloader(self) -> Downloader:
        """Lazily-initialised downloader."""
        if self._downloader is None:
            self._downloader = Downloader(self._transport)
        return self._downloader

    def search(self, query: str | Query | None = None, **kwargs: Any) -> SearchResult:
        """Search for datasets.

        Args:
            query: Search text or a :class:`Query` object.
            **kwargs: Additional :class:`Query` fields (limit, tags, etc.).
                When ``query`` is already a :class:`Query`, these override
                the matching fields on a copy of it; the object passed in is
                left untouched.

        Returns:
            A :class:`SearchResult` page.

        Raises:
            TypeError: If ``kwargs`` names something that is not a
                :class:`Query` field.
        """
        if isinstance(query, Query):
            if kwargs:
                # Keyword overrides used to be dropped silently here; apply
                # them to a copy so the caller's Query is not modified.
                from dataclasses import replace

                q = replace(query, **kwargs)
            else:
                q = query
        else:
            q = Query(text=query, **kwargs)
        return self.adapter.search(q)

    def get_dataset(self, dataset_id: str) -> Dataset:
        """Fetch a single dataset by ID."""
        return self.adapter.get_dataset(dataset_id)

    def list_resources(self, dataset_id: str) -> list[Resource]:
        """List resources for a dataset."""
        return self.adapter.list_resources(dataset_id)

    def get_organization(self, organization_id: str) -> Organization:
        """Fetch organization/publisher metadata."""
        return self.adapter.get_organization(organization_id)

    def read(self, resource: Resource) -> list[dict[str, Any]]:
        """Download and parse a resource into a list of record dicts.

        Raises:
            ValueError: If ``resource.url`` is empty or ``None``.
        """
        from datasluice.formats import get_reader

        if not resource.url:
            raise ValueError(f"Resource {resource.id!r} has no URL")
        data = self._transport.download(resource.url)
        # A resource without a declared format is read as CSV.
        fmt = (resource.format or "CSV").upper()
        reader = get_reader(fmt)
        return reader.read(data)

    def download(self, resource: Resource, dest: str | Path | None = None, **kwargs: Any) -> Path:
        """Download a single resource.

        ``dest`` and ``kwargs`` are forwarded unchanged to the downloader.
        """
        return self.downloader.download(resource, dest, **kwargs)

    def download_all(self, dataset: Dataset, dest: str | Path) -> list[Path]:
        """Download all resources in a dataset."""
        return self.downloader.download_many(dataset.resources, dest)

    def __repr__(self) -> str:
        """Return a short representation with the adapter's portal type and base URL."""
        return f"<DataSluice({self.adapter.portal_type!r}, {self.adapter.base_url!r})>"

downloader property

Lazily-initialised downloader.

download(resource, dest=None, **kwargs)

Download a single resource.

Source code in src/datasluice/client.py
def download(self, resource: Resource, dest: str | Path | None = None, **kwargs: Any) -> Path:
    """Download a single resource.

    Args:
        resource: Resource to download.
        dest: Optional destination, forwarded unchanged to the downloader.
        **kwargs: Extra keyword arguments forwarded to the downloader.

    Returns:
        The value returned by the downloader's ``download`` call.
    """
    fetcher = self.downloader
    return fetcher.download(resource, dest, **kwargs)

download_all(dataset, dest)

Download all resources in a dataset.

Source code in src/datasluice/client.py
def download_all(self, dataset: Dataset, dest: str | Path) -> list[Path]:
    """Download all resources in a dataset.

    Args:
        dataset: Dataset whose ``resources`` are handed to the downloader.
        dest: Destination, forwarded unchanged to the downloader.

    Returns:
        The value returned by the downloader's ``download_many`` call.
    """
    targets = dataset.resources
    return self.downloader.download_many(targets, dest)

get_dataset(dataset_id)

Fetch a single dataset by ID.

Source code in src/datasluice/client.py
def get_dataset(self, dataset_id: str) -> Dataset:
    """Fetch a single dataset by ID.

    Args:
        dataset_id: Identifier passed straight through to the adapter.

    Returns:
        The value returned by the adapter's ``get_dataset`` call.
    """
    backend = self.adapter
    return backend.get_dataset(dataset_id)

get_organization(organization_id)

Fetch organization/publisher metadata.

Source code in src/datasluice/client.py
def get_organization(self, organization_id: str) -> Organization:
    """Fetch organization/publisher metadata.

    Args:
        organization_id: Identifier passed straight through to the adapter.

    Returns:
        The value returned by the adapter's ``get_organization`` call.
    """
    backend = self.adapter
    return backend.get_organization(organization_id)

list_resources(dataset_id)

List resources for a dataset.

Source code in src/datasluice/client.py
def list_resources(self, dataset_id: str) -> list[Resource]:
    """List resources for a dataset.

    Args:
        dataset_id: Identifier passed straight through to the adapter.

    Returns:
        The value returned by the adapter's ``list_resources`` call.
    """
    backend = self.adapter
    return backend.list_resources(dataset_id)

read(resource)

Download and parse a resource into a list of record dicts.

Source code in src/datasluice/client.py
def read(self, resource: Resource) -> list[dict[str, Any]]:
    """Download and parse a resource into a list of record dicts.

    Args:
        resource: Resource to fetch; its ``url`` must be non-empty.

    Returns:
        The value returned by the selected reader's ``read`` call.

    Raises:
        ValueError: If ``resource.url`` is empty or ``None``.
    """
    from datasluice.formats import get_reader

    source_url = resource.url
    if not source_url:
        raise ValueError(f"Resource {resource.id!r} has no URL")

    payload = self._transport.download(source_url)
    # A resource without a declared format is read as CSV.
    declared = resource.format
    format_name = (declared if declared else "CSV").upper()
    return get_reader(format_name).read(payload)

search(query=None, **kwargs)

Search for datasets.

Parameters:

Name Type Description Default
query str | Query | None

Search text or a Query object.

None
**kwargs Any

Additional Query fields (limit, tags, etc.).

{}

Returns:

Type Description
SearchResult

A SearchResult page.

Source code in src/datasluice/client.py
def search(self, query: str | Query | None = None, **kwargs: Any) -> SearchResult:
    """Search for datasets.

    Args:
        query: Search text or a :class:`Query` object.
        **kwargs: Additional :class:`Query` fields (limit, tags, etc.).
            When ``query`` is already a :class:`Query`, these override the
            matching fields on a copy of it; the object passed in is left
            untouched.

    Returns:
        A :class:`SearchResult` page.

    Raises:
        TypeError: If ``kwargs`` names something that is not a
            :class:`Query` field.
    """
    if isinstance(query, Query):
        if kwargs:
            # Keyword overrides used to be dropped silently here; apply them
            # to a copy so the caller's Query is not modified.
            from dataclasses import replace

            q = replace(query, **kwargs)
        else:
            q = query
    else:
        q = Query(text=query, **kwargs)
    return self.adapter.search(q)

DataSluiceError

Bases: Exception

Base exception for all DataSluice errors.

Source code in src/datasluice/exceptions.py
class DataSluiceError(Exception):
    """Base exception for all DataSluice errors.

    Catch this class to handle any of the more specific DataSluice
    exceptions with a single handler.
    """

Dataset dataclass

A dataset is a logical grouping of one or more resources.

Attributes:

Name Type Description
id str

Portal-native dataset identifier.

title str | None

Human-readable dataset title.

name str | None

Machine-friendly slug or name.

description str | None

Longer free-text description (may contain Markdown/HTML).

resources list[Resource]

List of downloadable resources within this dataset.

organization Organization | None

Publishing organization, if known.

license License | None

Default license for resources in this dataset.

tags list[str]

Free-form tags or keywords.

themes list[str]

Categorization themes or groups.

language list[str]

ISO language code(s) for the data.

created str | None

ISO-8601 creation timestamp.

modified str | None

ISO-8601 last-modified timestamp.

metadata_modified str | None

ISO-8601 timestamp of last metadata change.

url str | None

Canonical URL to the dataset on the portal.

extra dict[str, Any]

Portal-native fields not captured above.

Source code in src/datasluice/domain/dataset.py
@dataclass(frozen=True)
class Dataset:
    """A dataset is a logical grouping of one or more resources.

    Only ``id`` is required; every other field has a default. The dataclass
    is frozen, so attributes cannot be reassigned after construction, but
    the list and dict fields can still be mutated in place.

    Attributes:
        id: Portal-native dataset identifier.
        title: Human-readable dataset title.
        name: Machine-friendly slug or name.
        description: Longer free-text description (may contain Markdown/HTML).
        resources: List of downloadable resources within this dataset.
        organization: Publishing organization, if known.
        license: Default license for resources in this dataset.
        tags: Free-form tags or keywords.
        themes: Categorization themes or groups.
        language: ISO language code(s) for the data.
        created: ISO-8601 creation timestamp.
        modified: ISO-8601 last-modified timestamp.
        metadata_modified: ISO-8601 timestamp of last metadata change.
        url: Canonical URL to the dataset on the portal.
        extra: Portal-native fields not captured above.
    """

    id: str
    title: str | None = None
    name: str | None = None
    description: str | None = None
    # default_factory gives each instance its own empty container.
    resources: list[Resource] = field(default_factory=list)
    organization: Organization | None = None
    license: License | None = None
    tags: list[str] = field(default_factory=list)
    themes: list[str] = field(default_factory=list)
    language: list[str] = field(default_factory=list)
    created: str | None = None
    modified: str | None = None
    metadata_modified: str | None = None
    url: str | None = None
    extra: dict[str, Any] = field(default_factory=dict)

DownloadError

Bases: DataSluiceError

Raised when a resource download fails.

Source code in src/datasluice/exceptions.py
class DownloadError(DataSluiceError):
    """Raised when a resource download fails.

    Derives from :class:`DataSluiceError`, so a handler for the base
    exception also catches it.
    """

FormatError

Bases: DataSluiceError

Raised when a resource cannot be parsed in the expected format.

Source code in src/datasluice/exceptions.py
class FormatError(DataSluiceError):
    """Raised when a resource cannot be parsed in the expected format.

    Derives from :class:`DataSluiceError`, so a handler for the base
    exception also catches it.
    """

License dataclass

A license under which an open-data resource or dataset is published.

Attributes:

Name Type Description
id str

Canonical license identifier (e.g. "CC-BY-4.0").

title str | None

Human-readable license name.

url str | None

URL to the full license text.

Source code in src/datasluice/domain/license.py
@dataclass(frozen=True)
class License:
    """A license under which an open-data resource or dataset is published.

    Only ``id`` is required. The dataclass is frozen, so attributes cannot
    be reassigned after construction.

    Attributes:
        id: Canonical license identifier (e.g. ``"CC-BY-4.0"``).
        title: Human-readable license name.
        url: URL to the full license text.
    """

    id: str
    title: str | None = None
    url: str | None = None

NotFoundError

Bases: PortalError

Raised when a requested dataset or resource does not exist.

Source code in src/datasluice/exceptions.py
class NotFoundError(PortalError):
    """Raised when a requested dataset or resource does not exist.

    Derives from :class:`PortalError`, so a handler for that class also
    catches it.
    """

Organization dataclass

An organization or publisher of open-data datasets.

Attributes:

Name Type Description
id str

Portal-native organization identifier.

name str | None

Display name of the organization.

title str | None

Alternative human-readable title.

description str | None

Longer description, if available.

url str | None

URL to the organization's page on the portal.

logo_url str | None

URL to the organization's logo image.

created str | None

ISO-8601 creation timestamp, if available.

extra dict[str, Any]

Portal-native fields not captured above.

Source code in src/datasluice/domain/organization.py
@dataclass(frozen=True)
class Organization:
    """An organization or publisher of open-data datasets.

    Only ``id`` is required. The dataclass is frozen, so attributes cannot
    be reassigned after construction, but the ``extra`` dict can still be
    mutated in place.

    Attributes:
        id: Portal-native organization identifier.
        name: Display name of the organization.
        title: Alternative human-readable title.
        description: Longer description, if available.
        url: URL to the organization's page on the portal.
        logo_url: URL to the organization's logo image.
        created: ISO-8601 creation timestamp, if available.
        extra: Portal-native fields not captured above.
    """

    id: str
    name: str | None = None
    title: str | None = None
    description: str | None = None
    url: str | None = None
    logo_url: str | None = None
    created: str | None = None
    # default_factory gives each instance its own empty dict.
    extra: dict[str, Any] = field(default_factory=dict)

PortalDetectionError

Bases: DataSluiceError

Raised when the portal type cannot be auto-detected.

Source code in src/datasluice/exceptions.py
class PortalDetectionError(DataSluiceError):
    """Raised when the portal type cannot be auto-detected.

    Derives from :class:`DataSluiceError`, so a handler for the base
    exception also catches it.
    """

PortalError

Bases: DataSluiceError

Raised when a portal returns an error or is unreachable.

Source code in src/datasluice/exceptions.py
class PortalError(DataSluiceError):
    """Raised when a portal returns an error or is unreachable.

    Derives from :class:`DataSluiceError`, so a handler for the base
    exception also catches it.
    """

Query dataclass

Portal-agnostic search parameters.

Attributes:

Name Type Description
text str | None

Free-text search query.

tags list[str]

Filter by one or more tags.

organizations list[str]

Filter by organization name(s).

groups list[str]

Filter by group or theme name(s).

res_format str | None

Filter by resource format (e.g. "CSV").

license_id str | None

Filter by license identifier.

sort str | None

Sort field and direction (e.g. "metadata_modified desc").

limit int

Maximum number of results to return.

offset int

Number of results to skip (for pagination).

Source code in src/datasluice/domain/query.py
@dataclass(frozen=True)
class Query:
    """Portal-agnostic search parameters.

    Every field has a default, so ``Query()`` is valid. The dataclass is
    frozen, so attributes cannot be reassigned after construction, but the
    list fields can still be mutated in place.

    Attributes:
        text: Free-text search query.
        tags: Filter by one or more tags.
        organizations: Filter by organization name(s).
        groups: Filter by group or theme name(s).
        res_format: Filter by resource format (e.g. ``"CSV"``).
        license_id: Filter by license identifier.
        sort: Sort field and direction (e.g. ``"metadata_modified desc"``).
        limit: Maximum number of results to return.
        offset: Number of results to skip (for pagination).
    """

    text: str | None = None
    # default_factory gives each instance its own empty list.
    tags: list[str] = field(default_factory=list)
    organizations: list[str] = field(default_factory=list)
    groups: list[str] = field(default_factory=list)
    res_format: str | None = None
    license_id: str | None = None
    sort: str | None = None
    limit: int = 100
    offset: int = 0

RateLimitError

Bases: PortalError

Raised when the portal rate-limits requests.

Source code in src/datasluice/exceptions.py
class RateLimitError(PortalError):
    """Raised when the portal rate-limits requests.

    Attributes:
        retry_after: The ``retry_after`` value given to the constructor, or
            ``None``. NOTE(review): presumably a delay in seconds; confirm
            against the code that raises this error.
    """

    def __init__(self, message: str, retry_after: float | None = None) -> None:
        """Pass ``message`` to the base class and record ``retry_after``."""
        super().__init__(message)
        self.retry_after = retry_after

Resource dataclass

A single downloadable resource (file) within a dataset.

Attributes:

Name Type Description
id str

Portal-native resource identifier.

name str | None

Human-readable resource name or title.

url str | None

Direct download URL.

format str | None

Canonical file format (e.g. "CSV", "JSON").

media_type str | None

IANA media type if known (e.g. "text/csv").

description str | None

Optional longer description.

size int | None

File size in bytes, if known.

license License | None

License under which this resource is published.

created str | None

ISO-8601 creation timestamp, if available.

modified str | None

ISO-8601 last-modified timestamp, if available.

extra dict[str, Any]

Portal-native fields not captured above.

Source code in src/datasluice/domain/resource.py
@dataclass(frozen=True)
class Resource:
    """A single downloadable resource (file) within a dataset.

    Only ``id`` is required. The dataclass is frozen, so attributes cannot
    be reassigned after construction, but the ``extra`` dict can still be
    mutated in place.

    Attributes:
        id: Portal-native resource identifier.
        name: Human-readable resource name or title.
        url: Direct download URL.
        format: Canonical file format (e.g. ``"CSV"``, ``"JSON"``).
        media_type: IANA media type if known (e.g. ``"text/csv"``).
        description: Optional longer description.
        size: File size in bytes, if known.
        license: License under which this resource is published.
        created: ISO-8601 creation timestamp, if available.
        modified: ISO-8601 last-modified timestamp, if available.
        extra: Portal-native fields not captured above.
    """

    id: str
    name: str | None = None
    url: str | None = None
    format: str | None = None
    media_type: str | None = None
    description: str | None = None
    size: int | None = None
    license: License | None = None
    created: str | None = None
    modified: str | None = None
    extra: dict[str, Any] = field(default_factory=dict)

    @classmethod
    def normalize_format(cls, raw: str | None) -> str | None:
        """Normalise a raw format string or media type to canonical form.

        Args:
            raw: Format string or media type as reported by the portal.

        Returns:
            ``None`` when ``raw`` is ``None``; otherwise the alias-table
            entry for the stripped, lower-cased value, or the stripped,
            upper-cased value when no alias exists.
        """
        if raw is None:
            return None
        # Strip before the alias lookup as well as for the fallback, so a
        # padded value such as " text/csv " still matches its alias.
        cleaned = raw.strip()
        return _FORMAT_ALIASES.get(cleaned.lower(), cleaned.upper())

normalize_format(raw) classmethod

Normalise a raw format string or media type to canonical form.

Source code in src/datasluice/domain/resource.py
@classmethod
def normalize_format(cls, raw: str | None) -> str | None:
    """Normalise a raw format string or media type to canonical form.

    Args:
        raw: Format string or media type as reported by the portal.

    Returns:
        ``None`` when ``raw`` is ``None``; otherwise the alias-table entry
        for the stripped, lower-cased value, or the stripped, upper-cased
        value when no alias exists.
    """
    if raw is None:
        return None
    # Strip before the alias lookup as well as for the fallback, so a padded
    # value such as " text/csv " still matches its alias.
    cleaned = raw.strip()
    return _FORMAT_ALIASES.get(cleaned.lower(), cleaned.upper())

SearchResult dataclass

A paginated page of search results.

Attributes:

Name Type Description
datasets list[Dataset]

Datasets returned in this page.

total int

Total number of matching datasets across all pages.

page int

Current page number (1-based).

page_size int

Number of results per page.

has_next bool

Whether additional pages are available.

Source code in src/datasluice/domain/result.py
@dataclass
class SearchResult:
    """A paginated page of search results.

    Iterating over an instance yields the datasets on this page, and
    ``len()`` counts the datasets on this page rather than ``total``.

    Attributes:
        datasets: Datasets returned in this page.
        total: Total number of matching datasets across all pages.
        page: Current page number (1-based).
        page_size: Number of results per page.
        has_next: Whether additional pages are available.
    """

    datasets: list[Dataset] = field(default_factory=list)
    total: int = 0
    page: int = 1
    page_size: int = 100
    has_next: bool = False

    def __len__(self) -> int:
        """Return the number of datasets held by this page."""
        page_items = self.datasets
        return len(page_items)

    def __iter__(self) -> Iterator[Dataset]:
        """Return an iterator over the datasets held by this page."""
        page_items = self.datasets
        return iter(page_items)