跳到内容

资源

资源

基类:BaseModel

表示用于下载和缓存测试数据的远程资源的类。

源代码位于 bionemo/core/data/resource.py
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
class Resource(pydantic.BaseModel):
    """Class that represents a remote resource for downloading and caching test data."""

    # With use_attribute_docstrings enabled, pydantic uses the string literal that
    # follows each field as that field's description, so those strings are model
    # metadata rather than plain comments.
    model_config = pydantic.ConfigDict(use_attribute_docstrings=True)

    # Required. The pattern accepts exactly one "/" anywhere in the string.
    tag: Annotated[str, pydantic.StringConstraints(pattern=r"^[^/]*/[^/]*$")]  # Only slash between filename and tag.
    """A unique identifier for the resource. The file(s) will be accessible via load("filename/tag")."""

    # Optional. When a string is given it is additionally passed through
    # _validate_ngc_resource (defined outside this block) after str validation.
    ngc: Annotated[str, pydantic.AfterValidator(_validate_ngc_resource)] | None = None
    """The NGC URL for the resource.

    Should be in format [org/[team/]]name[:version]. If None, the resource is not available on NGC.
    """

    # Optional on its own, but _validate_ngc_registry below rejects a model that
    # sets ngc without also setting this field.
    ngc_registry: Literal["model", "resource"] | None = None
    """The NGC resource type (model or resource) for the data. Must be provided if ngc is not None."""

    # Required. Only URLs using the "s3" scheme are accepted.
    pbss: Annotated[pydantic.AnyUrl, pydantic.UrlConstraints(allowed_schemes=["s3"])]
    """The PBSS (NVIDIA-internal) URL of the resource."""

    # No default: the field must always be supplied, although None is an accepted value.
    sha256: str | None
    """The SHA256 checksum of the resource. If None, the SHA will not be checked on download (not recommended)."""

    # Required. Parsed by pydantic's NameEmail type.
    owner: pydantic.NameEmail
    """The owner or primary point of contact for the resource, in the format "Name <email>"."""

    description: str | None = None
    """A description of the file(s)."""

    # The annotation only admits False or None; True is not an accepted value.
    unpack: Literal[False, None] = None
    """Whether the resource should be unpacked after download. If None, will defer to the file extension."""

    # The annotation only admits False or None; True is not an accepted value.
    decompress: Literal[False, None] = None
    """Whether the resource should be decompressed after download. If None, will defer to the file extension."""

    @pydantic.model_validator(mode="after")
    def _validate_ngc_registry(self) -> "Resource":
        """Require ngc_registry whenever ngc is set.

        Runs as an "after" model validator, i.e. on the already-constructed instance.

        Returns:
            The instance itself, unchanged.

        Raises:
            ValueError: If ngc is truthy while ngc_registry is falsy. The message includes the resource tag.
        """
        # Truthiness check: both None and an empty value count as "not provided".
        if self.ngc and not self.ngc_registry:
            raise ValueError(f"ngc_registry must be provided if ngc is not None: {self.tag}")
        return self

decompress: Literal[False, None] = None class-attribute instance-attribute

是否应在下载后解压缩资源。如果为 None,将参考文件扩展名。

description: str | None = None class-attribute instance-attribute

文件描述。

ngc: Annotated[str, pydantic.AfterValidator(_validate_ngc_resource)] | None = None class-attribute instance-attribute

资源的 NGC URL。

应为 [org/[team/]]name[:version] 格式。如果为 None,则资源在 NGC 上不可用。

ngc_registry: Literal['model', 'resource'] | None = None class-attribute instance-attribute

数据的 NGC 资源类型(模型或资源)。如果 ngc 不为 None,则必须提供。

owner: pydantic.NameEmail instance-attribute

资源的所有者或主要联系人,格式为 "Name <email>"。

pbss: Annotated[pydantic.AnyUrl, pydantic.UrlConstraints(allowed_schemes=['s3'])] instance-attribute

资源的 PBSS (NVIDIA 内部) URL。

sha256: str | None instance-attribute

资源的 SHA256 校验和。如果为 None,则下载时将不检查 SHA(不推荐)。

tag: Annotated[str, pydantic.StringConstraints(pattern='^[^/]*/[^/]*$')] instance-attribute

资源的唯一标识符。文件可以通过 load("filename/tag") 访问。

unpack: Literal[False, None] = None class-attribute instance-attribute

是否应在下载后解包资源。如果为 None,将参考文件扩展名。

get_all_resources(resource_path=None) cached

返回所有资源的字典。

源代码位于 bionemo/core/data/resource.py
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
@functools.cache
def get_all_resources(resource_path: Path | None = None) -> dict[str, Resource]:
    """Load every resource definition and index the results by tag.

    Args:
        resource_path: Directory to scan for ``*.yaml`` and ``*.yml`` files. When not
            given, the ``resources`` entry of the ``bionemo.core.data`` package is used.

    Returns:
        A mapping from each resource's tag to its validated ``Resource``.

    Raises:
        ValueError: If two or more parsed resources share the same tag.
    """
    if resource_path:
        search_dir = resource_path
    else:
        search_dir = Path(files("bionemo.core.data").joinpath("resources"))  # type: ignore

    # Gather the raw entries of every definition file, ``.yaml`` files first and then ``.yml``.
    raw_entries = []
    for definition_file in itertools.chain(search_dir.glob("*.yaml"), search_dir.glob("*.yml")):
        raw_entries.extend(_parse_resource_file(definition_file))

    validated = pydantic.TypeAdapter(list[Resource]).validate_python(raw_entries)

    # Report which tags are duplicated so that a user can begin debugging and resolve the issue.
    occurrences = Counter(entry.tag for entry in validated)
    repeated_tags = [tag for tag, count in occurrences.items() if count > 1]
    if repeated_tags:
        raise ValueError(f"Duplicate resource tags found!: {repeated_tags}")

    return {entry.tag: entry for entry in validated}