Skip to content

nih_chestxray

load_data

Load and return the NIH Chest X-ray dataset.

Parameters:

Name Type Description Default
root_dir Optional[str]

The path to store the downloaded data. When root_dir is not provided, the data will be saved into fastestimator_data under the user's home directory.

None

Returns:

Type Description
DirDataset

train_data

Source code in fastestimator/fastestimator/dataset/data/nih_chestxray.py
def load_data(root_dir: Optional[str] = None) -> DirDataset:
    """Load and return the NIH Chest X-ray dataset.

    The archives are downloaded and extracted only when `<root_dir>/NIH_Chestxray/images` does not exist yet.
    Extraction happens in a staging directory and the result is moved into place only after every archive has been
    unpacked, so an interrupted run never leaves a partial `images` folder that would later be mistaken for a
    complete dataset.

    Args:
        root_dir: The path to store the downloaded data. When `root_dir` is not provided, the data will be saved into
            `fastestimator_data` under the user's home directory.

    Returns:
        train_data: A DirDataset over the `.png` files directly inside the extracted `images` directory
            (non-recursive search).
    """
    # Function-scope import: only the extraction path needs it.
    import shutil

    if root_dir is None:
        root_dir = os.path.join(str(Path.home()), 'fastestimator_data', 'NIH_Chestxray')
    else:
        root_dir = os.path.join(os.path.abspath(root_dir), 'NIH_Chestxray')
    os.makedirs(root_dir, exist_ok=True)

    image_extracted_path = os.path.join(root_dir, 'images')

    if not os.path.exists(image_extracted_path):
        # download data
        links = [
            'https://nihcc.box.com/shared/static/vfk49d74nhbxq3nqjg0900w5nvkorp5c.gz',
            'https://nihcc.box.com/shared/static/i28rlmbvmfjbl8p2n3ril0pptcmcu9d1.gz',
            'https://nihcc.box.com/shared/static/f1t00wrtdk94satdfb9olcolqx20z2jp.gz',
            'https://nihcc.box.com/shared/static/0aowwzs5lhjrceb3qp67ahp0rd1l1etg.gz',
            'https://nihcc.box.com/shared/static/v5e3goj22zr6h8tzualxfsqlqaygfbsn.gz',
            'https://nihcc.box.com/shared/static/asi7ikud9jwnkrnkj99jnpfkjdes7l6l.gz',
            'https://nihcc.box.com/shared/static/jn1b4mw4n6lnh74ovmcjb8y48h8xj07n.gz',
            'https://nihcc.box.com/shared/static/tvpxmn7qyrgl0w8wfh9kqfjskv6nmm1j.gz',
            'https://nihcc.box.com/shared/static/upyy3ml7qdumlgk2rfcvlb9k6gvqq2pj.gz',
            'https://nihcc.box.com/shared/static/l6nilvfa9cg3s28tqv1qc1olm3gnz54p.gz',
            'https://nihcc.box.com/shared/static/hhq8fkdgvcari67vfhs7ppg2w6ni4jze.gz',
            'https://nihcc.box.com/shared/static/ioqwiy20ihqwyr8pf4c24eazhh281pbu.gz'
        ]
        num_files = len(links)
        data_paths = [os.path.join(root_dir, "images_{}.tar.gz".format(idx)) for idx in range(num_files)]
        for idx, (link, data_path) in enumerate(zip(links, data_paths)):
            _download_data(link, data_path, idx, num_files)

        # Extract into a staging directory inside root_dir (same filesystem, so the final move is a cheap rename).
        # Any leftovers from a previously interrupted run are discarded first.
        staging_dir = os.path.join(root_dir, 'extract_tmp')
        shutil.rmtree(staging_dir, ignore_errors=True)
        os.makedirs(staging_dir)

        for idx, data_path in enumerate(data_paths):
            print("Extracting {}, file {} / {}".format(data_path, idx + 1, num_files))
            with tarfile.open(data_path) as img_tar:
                img_tar.extractall(staging_dir)

        # Publish the extracted content into root_dir. The `images` entry is moved last because its presence is what
        # marks the dataset as fully extracted on subsequent calls.
        images_name = os.path.basename(image_extracted_path)
        for entry in sorted(os.listdir(staging_dir), key=lambda name: name == images_name):
            os.rename(os.path.join(staging_dir, entry), os.path.join(root_dir, entry))
        os.rmdir(staging_dir)

    return DirDataset(image_extracted_path, file_extension='.png', recursive_search=False)