API Reference¶

DataSluice — one Python interface for open-data discovery, extraction, format normalization, and pipeline integration.

`AdapterError` ¶

Bases: DataSluiceError

Raised when an adapter cannot fulfil a request.

Source code in src/datasluice/exceptions.py

class AdapterError(DataSluiceError):
    """Raised when an adapter cannot fulfil a request.

    Derives from ``DataSluiceError``, so a handler for that base type also
    catches this exception.
    """

`AdapterNotFoundError` ¶

Bases: AdapterError

Raised when no adapter is registered for a portal type.

Source code in src/datasluice/exceptions.py

class AdapterNotFoundError(AdapterError):
    """Raised when no adapter is registered for a portal type.

    Derives from ``AdapterError``, so a handler for ``AdapterError`` also
    catches this exception.
    """

`AuthenticationError` ¶

Bases: DataSluiceError

Raised when authentication credentials are missing or invalid.

Source code in src/datasluice/exceptions.py

class AuthenticationError(DataSluiceError):
    """Raised when authentication credentials are missing or invalid.

    Derives from ``DataSluiceError``, so a handler for that base type also
    catches this exception.
    """

`ChecksumMismatchError` ¶

Bases: DownloadError

Raised when a downloaded file's checksum does not match.

Source code in src/datasluice/exceptions.py

class ChecksumMismatchError(DownloadError):
    """Signal that a downloaded file failed checksum verification.

    Attributes:
        expected: The checksum value passed as ``expected``, or ``None``.
        actual: The checksum value passed as ``actual``, or ``None``.
    """

    def __init__(self, message: str, expected: str | None = None, actual: str | None = None) -> None:
        """Store the error message together with both checksum values.

        Args:
            message: Text forwarded to the base exception.
            expected: Optional checksum value kept on ``self.expected``.
            actual: Optional checksum value kept on ``self.actual``.
        """
        super().__init__(message)
        # Keep both digests on the instance so handlers can inspect them.
        self.expected, self.actual = expected, actual

`ConfigError` ¶

Bases: DataSluiceError

Raised when configuration is invalid or incomplete.

Source code in src/datasluice/exceptions.py

class ConfigError(DataSluiceError):
    """Raised when configuration is invalid or incomplete.

    Derives from ``DataSluiceError``, so a handler for that base type also
    catches this exception.
    """

`DataSluice` ¶

Unified client for open-data portals.

Parameters:

Name	Type	Description	Default
`portal_url`	`str`	Base URL of the open-data portal.	required
`portal_type`	`str \| None`	Optional explicit portal type (e.g. `"ckan"`). Auto-detected when omitted.	`None`
`auth`	`BaseAuth \| None`	Optional authentication strategy.	`None`
`settings`	`Settings \| None`	Optional pre-loaded settings. Loaded from the environment when omitted.	`None`
`transport`	`HttpClient \| None`	Optional pre-configured HTTP client.	`None`

Example

>>> from datasluice import DataSluice
>>> ds = DataSluice("https://catalog.data.gov")
>>> results = ds.search("climate change")
>>> for dataset in results:
...     print(dataset.title)

Source code in src/datasluice/client.py

class DataSluice:
    """Unified client for open-data portals.

    The client builds (or accepts) an HTTP transport, creates the adapter for
    ``portal_url`` and hands it that transport, then forwards search,
    metadata, read and download calls to the adapter, the transport or a
    lazily created downloader.

    Args:
        portal_url: Base URL of the open-data portal.
        portal_type: Optional explicit portal type (e.g. ``"ckan"``).
            Auto-detected when omitted.
        auth: Optional authentication strategy.
        settings: Optional pre-loaded settings. Loaded from the environment
            when omitted.
        transport: Optional pre-configured HTTP client.

    Example:
        >>> from datasluice import DataSluice
        >>> ds = DataSluice("https://catalog.data.gov")
        >>> results = ds.search("climate change")
        >>> for dataset in results:
        ...     print(dataset.title)
    """

    def __init__(
        self,
        portal_url: str,
        *,
        portal_type: str | None = None,
        auth: BaseAuth | None = None,
        settings: Settings | None = None,
        transport: HttpClient | None = None,
    ) -> None:
        """Load settings, configure logging and wire up transport and adapter."""
        self.settings = settings or load_settings()
        configure_logging(self.settings.log_level)

        # ``_build_transport`` reads ``self.settings`` and ``self.auth``, so
        # both must be assigned before it is called.
        self.auth = auth
        self._transport = transport or self._build_transport()
        self.adapter = create_adapter(portal_url, portal_type=portal_type, auth=auth)
        # The adapter and the downloader share this one HTTP client.
        self.adapter._transport = self._transport

        self._downloader: Downloader | None = None
        logger.debug("Initialised DataSluice for %s (%s)", portal_url, self.adapter.portal_type)

    def _build_transport(self) -> HttpClient:
        """Construct the HTTP client from settings.

        A rate limiter is only attached when ``settings.rate_limit`` is
        truthy; otherwise ``None`` is passed for it.
        """
        rate_limiter = RateLimiter(requests_per_second=self.settings.rate_limit) if self.settings.rate_limit else None
        retry_policy = RetryPolicy(max_attempts=self.settings.http_retries)
        return HttpClient(
            auth=self.auth,
            timeout=self.settings.http_timeout,
            retry_policy=retry_policy,
            rate_limiter=rate_limiter,
            user_agent=self.settings.user_agent,
        )

    @property
    def downloader(self) -> Downloader:
        """Lazily-initialised downloader, created on first access and reused."""
        if self._downloader is None:
            self._downloader = Downloader(self._transport)
        return self._downloader

    def search(self, query: str | Query | None = None, **kwargs: Any) -> SearchResult:
        """Search for datasets.

        Args:
            query: Search text or a ``Query`` object.
            **kwargs: Additional ``Query`` fields (limit, tags, etc.). When
                ``query`` is already a ``Query``, these are applied on top of
                a copy of it; the object passed in is not modified.

        Returns:
            The page of results produced by the adapter.

        Raises:
            TypeError: If ``kwargs`` contains a name that is not a ``Query``
                field.
        """
        if isinstance(query, Query):
            if kwargs:
                # Previously these overrides were silently dropped; merge
                # them into a copy of the (frozen) query instead.
                from dataclasses import replace

                q = replace(query, **kwargs)
            else:
                q = query
        else:
            q = Query(text=query, **kwargs)
        return self.adapter.search(q)

    def get_dataset(self, dataset_id: str) -> Dataset:
        """Fetch a single dataset by ID via the adapter."""
        return self.adapter.get_dataset(dataset_id)

    def list_resources(self, dataset_id: str) -> list[Resource]:
        """List resources for a dataset via the adapter."""
        return self.adapter.list_resources(dataset_id)

    def get_organization(self, organization_id: str) -> Organization:
        """Fetch organization/publisher metadata via the adapter."""
        return self.adapter.get_organization(organization_id)

    def read(self, resource: Resource) -> list[dict[str, Any]]:
        """Download and parse a resource into a list of record dicts.

        Args:
            resource: Resource to fetch; its ``url`` must be set.

        Returns:
            The value produced by the format reader's ``read`` method.

        Raises:
            ValueError: If ``resource.url`` is empty or ``None``.
        """
        from datasluice.formats import get_reader

        if not resource.url:
            raise ValueError(f"Resource {resource.id!r} has no URL")
        data = self._transport.download(resource.url)
        # Resources without a declared format are parsed as CSV.
        fmt = (resource.format or "CSV").upper()
        reader = get_reader(fmt)
        return reader.read(data)

    def download(self, resource: Resource, dest: str | Path | None = None, **kwargs: Any) -> Path:
        """Download a single resource through the shared downloader.

        Extra keyword arguments are forwarded to the downloader unchanged.
        """
        return self.downloader.download(resource, dest, **kwargs)

    def download_all(self, dataset: Dataset, dest: str | Path) -> list[Path]:
        """Download all resources in a dataset through the shared downloader."""
        return self.downloader.download_many(dataset.resources, dest)

    def __repr__(self) -> str:
        """Return a debug representation with the portal type and base URL."""
        return f"<DataSluice({self.adapter.portal_type!r}, {self.adapter.base_url!r})>"

`downloader` `property` ¶

Lazily-initialised downloader.

`download(resource, dest=None, **kwargs)` ¶

Download a single resource.

Source code in src/datasluice/client.py

def download(self, resource: Resource, dest: str | Path | None = None, **kwargs: Any) -> Path:
    """Fetch one resource by delegating to the client's downloader.

    Args:
        resource: Resource to fetch; passed through unchanged.
        dest: Optional destination; passed through unchanged.
        **kwargs: Extra keyword options forwarded to the downloader.

    Returns:
        Whatever the downloader's ``download`` call returns.
    """
    fetcher = self.downloader
    saved_path = fetcher.download(resource, dest, **kwargs)
    return saved_path

`download_all(dataset, dest)` ¶

Download all resources in a dataset.

Source code in src/datasluice/client.py

def download_all(self, dataset: Dataset, dest: str | Path) -> list[Path]:
    """Fetch every resource of ``dataset`` by delegating to the downloader.

    Args:
        dataset: Dataset whose ``resources`` are handed to the downloader.
        dest: Destination passed through unchanged.

    Returns:
        Whatever the downloader's ``download_many`` call returns.
    """
    fetcher = self.downloader
    saved_paths = fetcher.download_many(dataset.resources, dest)
    return saved_paths

`get_dataset(dataset_id)` ¶

Fetch a single dataset by ID.

Source code in src/datasluice/client.py

def get_dataset(self, dataset_id: str) -> Dataset:
    """Look up one dataset by its identifier through the portal adapter.

    Args:
        dataset_id: Identifier passed to the adapter unchanged.

    Returns:
        Whatever the adapter's ``get_dataset`` call returns.
    """
    backend = self.adapter
    found = backend.get_dataset(dataset_id)
    return found

`get_organization(organization_id)` ¶

Fetch organization/publisher metadata.

Source code in src/datasluice/client.py

def get_organization(self, organization_id: str) -> Organization:
    """Look up organization/publisher metadata through the portal adapter.

    Args:
        organization_id: Identifier passed to the adapter unchanged.

    Returns:
        Whatever the adapter's ``get_organization`` call returns.
    """
    backend = self.adapter
    found = backend.get_organization(organization_id)
    return found

`list_resources(dataset_id)` ¶

List resources for a dataset.

Source code in src/datasluice/client.py

def list_resources(self, dataset_id: str) -> list[Resource]:
    """Enumerate the resources of a dataset through the portal adapter.

    Args:
        dataset_id: Identifier passed to the adapter unchanged.

    Returns:
        Whatever the adapter's ``list_resources`` call returns.
    """
    backend = self.adapter
    listing = backend.list_resources(dataset_id)
    return listing

`read(resource)` ¶

Download and parse a resource into a list of record dicts.

Source code in src/datasluice/client.py

def read(self, resource: Resource) -> list[dict[str, Any]]:
    """Fetch a resource's payload and parse it with the matching reader.

    Args:
        resource: Resource to fetch; its ``url`` must be set.

    Returns:
        The value produced by the selected reader's ``read`` method.

    Raises:
        ValueError: If ``resource.url`` is empty or ``None``.
    """
    from datasluice.formats import get_reader

    if not resource.url:
        raise ValueError(f"Resource {resource.id!r} has no URL")
    payload = self._transport.download(resource.url)
    # A missing/empty format falls back to CSV before upper-casing.
    format_name = resource.format or "CSV"
    parser = get_reader(format_name.upper())
    return parser.read(payload)

`search(query=None, **kwargs)` ¶

Search for datasets.

Parameters:

Name	Type	Description	Default
`query`	`str \| Query \| None`	Search text or a `Query` object.	`None`
`**kwargs`	`Any`	Additional `Query` fields (limit, tags, etc.).	`{}`

Returns:

Type	Description
`SearchResult`	A `SearchResult` page.

Source code in src/datasluice/client.py

def search(self, query: str | Query | None = None, **kwargs: Any) -> SearchResult:
    """Search for datasets.

    Args:
        query: Search text or a ``Query`` object.
        **kwargs: Additional ``Query`` fields (limit, tags, etc.). When
            ``query`` is already a ``Query``, these are applied on top of a
            copy of it; the object passed in is not modified.

    Returns:
        The page of results produced by the adapter.

    Raises:
        TypeError: If ``kwargs`` contains a name that is not a ``Query``
            field.
    """
    if isinstance(query, Query):
        if kwargs:
            # Previously these overrides were silently dropped; merge them
            # into a copy of the (frozen) query instead.
            from dataclasses import replace

            q = replace(query, **kwargs)
        else:
            q = query
    else:
        q = Query(text=query, **kwargs)
    return self.adapter.search(q)

`DataSluiceError` ¶

Bases: Exception

Base exception for all DataSluice errors.

Source code in src/datasluice/exceptions.py

class DataSluiceError(Exception):
    """Base exception for all DataSluice errors.

    Catching this type also handles every exception class derived from it.
    """

`Dataset` `dataclass` ¶

A dataset is a logical grouping of one or more resources.

Attributes:

Name	Type	Description
`id`	`str`	Portal-native dataset identifier.
`title`	`str \| None`	Human-readable dataset title.
`name`	`str \| None`	Machine-friendly slug or name.
`description`	`str \| None`	Longer free-text description (may contain Markdown/HTML).
`resources`	`list[Resource]`	List of downloadable resources within this dataset.
`organization`	`Organization \| None`	Publishing organization, if known.
`license`	`License \| None`	Default license for resources in this dataset.
`tags`	`list[str]`	Free-form tags or keywords.
`themes`	`list[str]`	Categorization themes or groups.
`language`	`list[str]`	ISO language code(s) for the data.
`created`	`str \| None`	ISO-8601 creation timestamp.
`modified`	`str \| None`	ISO-8601 last-modified timestamp.
`metadata_modified`	`str \| None`	ISO-8601 timestamp of last metadata change.
`url`	`str \| None`	Canonical URL to the dataset on the portal.
`extra`	`dict[str, Any]`	Portal-native fields not captured above.

Source code in src/datasluice/domain/dataset.py

@dataclass(frozen=True)
class Dataset:
    """A dataset is a logical grouping of one or more resources.

    Only ``id`` is required; every other field has a default.

    Note:
        ``frozen=True`` only blocks attribute reassignment. The list and
        dict fields can still be mutated in place, and because they are
        unhashable, calling ``hash()`` on an instance raises ``TypeError``.

    Attributes:
        id: Portal-native dataset identifier.
        title: Human-readable dataset title.
        name: Machine-friendly slug or name.
        description: Longer free-text description (may contain Markdown/HTML).
        resources: List of downloadable resources within this dataset.
        organization: Publishing organization, if known.
        license: Default license for resources in this dataset.
        tags: Free-form tags or keywords.
        themes: Categorization themes or groups.
        language: ISO language code(s) for the data.
        created: ISO-8601 creation timestamp.
        modified: ISO-8601 last-modified timestamp.
        metadata_modified: ISO-8601 timestamp of last metadata change.
        url: Canonical URL to the dataset on the portal.
        extra: Portal-native fields not captured above.
    """

    id: str
    title: str | None = None
    name: str | None = None
    description: str | None = None
    # default_factory gives every instance its own empty container.
    resources: list[Resource] = field(default_factory=list)
    organization: Organization | None = None
    license: License | None = None
    tags: list[str] = field(default_factory=list)
    themes: list[str] = field(default_factory=list)
    language: list[str] = field(default_factory=list)
    # Timestamps are kept as strings, not datetime objects.
    created: str | None = None
    modified: str | None = None
    metadata_modified: str | None = None
    url: str | None = None
    extra: dict[str, Any] = field(default_factory=dict)

`DownloadError` ¶

Bases: DataSluiceError

Raised when a resource download fails.

Source code in src/datasluice/exceptions.py

class DownloadError(DataSluiceError):
    """Raised when a resource download fails.

    Derives from ``DataSluiceError``, so a handler for that base type also
    catches this exception.
    """

`FormatError` ¶

Bases: DataSluiceError

Raised when a resource cannot be parsed in the expected format.

Source code in src/datasluice/exceptions.py

class FormatError(DataSluiceError):
    """Raised when a resource cannot be parsed in the expected format.

    Derives from ``DataSluiceError``, so a handler for that base type also
    catches this exception.
    """

`License` `dataclass` ¶

A license under which an open-data resource or dataset is published.

Attributes:

Name	Type	Description
`id`	`str`	Canonical license identifier (e.g. `"CC-BY-4.0"`).
`title`	`str \| None`	Human-readable license name.
`url`	`str \| None`	URL to the full license text.

Source code in src/datasluice/domain/license.py

@dataclass(frozen=True)
class License:
    """A license under which an open-data resource or dataset is published.

    Only ``id`` is required. The dataclass is frozen, so attributes cannot
    be reassigned after construction.

    Attributes:
        id: Canonical license identifier (e.g. ``"CC-BY-4.0"``).
        title: Human-readable license name.
        url: URL to the full license text.
    """

    id: str
    title: str | None = None
    url: str | None = None

`NotFoundError` ¶

Bases: PortalError

Raised when a requested dataset or resource does not exist.

Source code in src/datasluice/exceptions.py

class NotFoundError(PortalError):
    """Raised when a requested dataset or resource does not exist.

    Derives from ``PortalError``, so a handler for ``PortalError`` also
    catches this exception.
    """

`Organization` `dataclass` ¶

An organization or publisher of open-data datasets.

Attributes:

Name	Type	Description
`id`	`str`	Portal-native organization identifier.
`name`	`str \| None`	Display name of the organization.
`title`	`str \| None`	Alternative human-readable title.
`description`	`str \| None`	Longer description, if available.
`url`	`str \| None`	URL to the organization's page on the portal.
`logo_url`	`str \| None`	URL to the organization's logo image.
`created`	`str \| None`	ISO-8601 creation timestamp, if available.
`extra`	`dict[str, Any]`	Portal-native fields not captured above.

Source code in src/datasluice/domain/organization.py

@dataclass(frozen=True)
class Organization:
    """An organization or publisher of open-data datasets.

    Only ``id`` is required; every other field has a default.

    Note:
        ``frozen=True`` only blocks attribute reassignment. The ``extra``
        dict can still be mutated in place, and because a dict is
        unhashable, calling ``hash()`` on an instance raises ``TypeError``.

    Attributes:
        id: Portal-native organization identifier.
        name: Display name of the organization.
        title: Alternative human-readable title.
        description: Longer description, if available.
        url: URL to the organization's page on the portal.
        logo_url: URL to the organization's logo image.
        created: ISO-8601 creation timestamp, if available.
        extra: Portal-native fields not captured above.
    """

    id: str
    name: str | None = None
    title: str | None = None
    description: str | None = None
    url: str | None = None
    logo_url: str | None = None
    # Timestamp is kept as a string, not a datetime object.
    created: str | None = None
    # default_factory gives every instance its own empty dict.
    extra: dict[str, Any] = field(default_factory=dict)

`PortalDetectionError` ¶

Bases: DataSluiceError

Raised when the portal type cannot be auto-detected.

Source code in src/datasluice/exceptions.py

class PortalDetectionError(DataSluiceError):
    """Raised when the portal type cannot be auto-detected.

    Derives from ``DataSluiceError``, so a handler for that base type also
    catches this exception.
    """

`PortalError` ¶

Bases: DataSluiceError

Raised when a portal returns an error or is unreachable.

Source code in src/datasluice/exceptions.py

class PortalError(DataSluiceError):
    """Raised when a portal returns an error or is unreachable.

    Derives from ``DataSluiceError``, so a handler for that base type also
    catches this exception.
    """

`Query` `dataclass` ¶

Portal-agnostic search parameters.

Attributes:

Name	Type	Description
`text`	`str \| None`	Free-text search query.
`tags`	`list[str]`	Filter by one or more tags.
`organizations`	`list[str]`	Filter by organization name(s).
`groups`	`list[str]`	Filter by group or theme name(s).
`res_format`	`str \| None`	Filter by resource format (e.g. `"CSV"`).
`license_id`	`str \| None`	Filter by license identifier.
`sort`	`str \| None`	Sort field and direction (e.g. `"metadata_modified desc"`).
`limit`	`int`	Maximum number of results to return.
`offset`	`int`	Number of results to skip (for pagination).

Source code in src/datasluice/domain/query.py

@dataclass(frozen=True)
class Query:
    """Portal-agnostic search parameters.

    Every field has a default, so ``Query()`` is a valid query.

    Note:
        ``frozen=True`` only blocks attribute reassignment. The list fields
        can still be mutated in place, and because lists are unhashable,
        calling ``hash()`` on an instance raises ``TypeError``.

    Attributes:
        text: Free-text search query.
        tags: Filter by one or more tags.
        organizations: Filter by organization name(s).
        groups: Filter by group or theme name(s).
        res_format: Filter by resource format (e.g. ``"CSV"``).
        license_id: Filter by license identifier.
        sort: Sort field and direction (e.g. ``"metadata_modified desc"``).
        limit: Maximum number of results to return. Defaults to 100.
        offset: Number of results to skip (for pagination). Defaults to 0.
    """

    text: str | None = None
    # default_factory gives every instance its own empty list.
    tags: list[str] = field(default_factory=list)
    organizations: list[str] = field(default_factory=list)
    groups: list[str] = field(default_factory=list)
    res_format: str | None = None
    license_id: str | None = None
    sort: str | None = None
    limit: int = 100
    offset: int = 0

`RateLimitError` ¶

Bases: PortalError

Raised when the portal rate-limits requests.

Source code in src/datasluice/exceptions.py

class RateLimitError(PortalError):
    """Signal that the portal is rate-limiting requests.

    Attributes:
        retry_after: The value passed as ``retry_after``, or ``None``.
    """

    def __init__(self, message: str, retry_after: float | None = None) -> None:
        """Store the error message and the optional retry hint.

        Args:
            message: Text forwarded to the base exception.
            retry_after: Optional value kept on ``self.retry_after``.
        """
        # Keep the hint on the instance so handlers can inspect it.
        self.retry_after = retry_after
        super().__init__(message)

`Resource` `dataclass` ¶

A single downloadable resource (file) within a dataset.

Attributes:

Name	Type	Description
`id`	`str`	Portal-native resource identifier.
`name`	`str \| None`	Human-readable resource name or title.
`url`	`str \| None`	Direct download URL.
`format`	`str \| None`	Canonical file format (e.g. `"CSV"`, `"JSON"`).
`media_type`	`str \| None`	IANA media type if known (e.g. `"text/csv"`).
`description`	`str \| None`	Optional longer description.
`size`	`int \| None`	File size in bytes, if known.
`license`	`License \| None`	License under which this resource is published.
`created`	`str \| None`	ISO-8601 creation timestamp, if available.
`modified`	`str \| None`	ISO-8601 last-modified timestamp, if available.
`extra`	`dict[str, Any]`	Portal-native fields not captured above.

Source code in src/datasluice/domain/resource.py

@dataclass(frozen=True)
class Resource:
    """A single downloadable resource (file) within a dataset.

    Only ``id`` is required; every other field has a default.

    Note:
        ``frozen=True`` only blocks attribute reassignment. The ``extra``
        dict can still be mutated in place, and because a dict is
        unhashable, calling ``hash()`` on an instance raises ``TypeError``.

    Attributes:
        id: Portal-native resource identifier.
        name: Human-readable resource name or title.
        url: Direct download URL.
        format: Canonical file format (e.g. ``"CSV"``, ``"JSON"``).
        media_type: IANA media type if known (e.g. ``"text/csv"``).
        description: Optional longer description.
        size: File size in bytes, if known.
        license: License under which this resource is published.
        created: ISO-8601 creation timestamp, if available.
        modified: ISO-8601 last-modified timestamp, if available.
        extra: Portal-native fields not captured above.
    """

    id: str
    name: str | None = None
    url: str | None = None
    format: str | None = None
    media_type: str | None = None
    description: str | None = None
    size: int | None = None
    license: License | None = None
    created: str | None = None
    modified: str | None = None
    extra: dict[str, Any] = field(default_factory=dict)

    @classmethod
    def normalize_format(cls, raw: str | None) -> str | None:
        """Normalise a raw format string or media type to canonical form.

        Surrounding whitespace is removed before the alias lookup, so padded
        input such as ``" text/csv "`` resolves to the same alias as the
        unpadded form.

        Args:
            raw: Format string or media type as reported by the portal.

        Returns:
            ``None`` when ``raw`` is ``None``; the alias registered for the
            lower-cased value when there is one; otherwise the stripped,
            upper-cased input.
        """
        if raw is None:
            return None
        # Strip first: the lookup key used to keep its whitespace while the
        # fallback did not, so padded aliases were never matched.
        cleaned = raw.strip()
        return _FORMAT_ALIASES.get(cleaned.lower(), cleaned.upper())

`normalize_format(raw)` `classmethod` ¶

Normalise a raw format string or media type to canonical form.

Source code in src/datasluice/domain/resource.py

@classmethod
def normalize_format(cls, raw: str | None) -> str | None:
    """Normalise a raw format string or media type to canonical form.

    Surrounding whitespace is removed before the alias lookup, so padded
    input such as ``" text/csv "`` resolves to the same alias as the
    unpadded form.

    Args:
        raw: Format string or media type as reported by the portal.

    Returns:
        ``None`` when ``raw`` is ``None``; the alias registered for the
        lower-cased value when there is one; otherwise the stripped,
        upper-cased input.
    """
    if raw is None:
        return None
    # Strip first: the lookup key used to keep its whitespace while the
    # fallback did not, so padded aliases were never matched.
    cleaned = raw.strip()
    return _FORMAT_ALIASES.get(cleaned.lower(), cleaned.upper())

`SearchResult` `dataclass` ¶

A paginated page of search results.

Attributes:

Name	Type	Description
`datasets`	`list[Dataset]`	Datasets returned in this page.
`total`	`int`	Total number of matching datasets across all pages.
`page`	`int`	Current page number (1-based).
`page_size`	`int`	Number of results per page.
`has_next`	`bool`	Whether additional pages are available.

Source code in src/datasluice/domain/result.py

@dataclass
class SearchResult:
    """One page of datasets returned by a search.

    Iterating the object and taking its ``len()`` both operate on
    ``datasets``, i.e. on the current page only.

    Attributes:
        datasets: Datasets returned in this page.
        total: Total number of matching datasets across all pages.
        page: Current page number (1-based).
        page_size: Number of results per page.
        has_next: Whether additional pages are available.
    """

    datasets: list[Dataset] = field(default_factory=list)
    total: int = 0
    page: int = 1
    page_size: int = 100
    has_next: bool = False

    def __iter__(self) -> Iterator[Dataset]:
        """Return an iterator over the datasets of this page."""
        page_items = self.datasets
        return iter(page_items)

    def __len__(self) -> int:
        """Return how many datasets this page holds (not ``total``)."""
        page_items = self.datasets
        return len(page_items)

API Reference¶

AdapterError ¶

AdapterNotFoundError ¶

AuthenticationError ¶

ChecksumMismatchError ¶

ConfigError ¶

DataSluice ¶

downloader property ¶

download(resource, dest=None, **kwargs) ¶

download_all(dataset, dest) ¶

get_dataset(dataset_id) ¶

get_organization(organization_id) ¶

list_resources(dataset_id) ¶

read(resource) ¶

search(query=None, **kwargs) ¶

DataSluiceError ¶

Dataset dataclass ¶

DownloadError ¶

FormatError ¶

License dataclass ¶

NotFoundError ¶

Organization dataclass ¶

PortalDetectionError ¶

PortalError ¶

Query dataclass ¶

RateLimitError ¶

Resource dataclass ¶

normalize_format(raw) classmethod ¶

SearchResult dataclass ¶

`AdapterError` ¶

`AdapterNotFoundError` ¶

`AuthenticationError` ¶

`ChecksumMismatchError` ¶

`ConfigError` ¶

`DataSluice` ¶

`downloader` `property` ¶

`download(resource, dest=None, **kwargs)` ¶

`download_all(dataset, dest)` ¶

`get_dataset(dataset_id)` ¶

`get_organization(organization_id)` ¶

`list_resources(dataset_id)` ¶

`read(resource)` ¶

`search(query=None, **kwargs)` ¶

`DataSluiceError` ¶

`Dataset` `dataclass` ¶

`DownloadError` ¶

`FormatError` ¶

`License` `dataclass` ¶

`NotFoundError` ¶

`Organization` `dataclass` ¶

`PortalDetectionError` ¶

`PortalError` ¶

`Query` `dataclass` ¶

`RateLimitError` ¶

`Resource` `dataclass` ¶

`normalize_format(raw)` `classmethod` ¶

`SearchResult` `dataclass` ¶