
I am new to Dask, and I ran into a problem when executing this example code:

from dask.distributed import Client
from dask_cuda import LocalCUDACluster
cluster = LocalCUDACluster()
client = Client(cluster)

I get the following error:

AttributeError                            Traceback (most recent call last)
File ~/miniconda3/envs/rapids-23.04/lib/python3.10/site-packages/distributed/deploy/spec.py:319, in SpecCluster._start(self)
    318     cls = import_term(cls)
--> 319 self.scheduler = cls(**self.scheduler_spec.get("options", {}))
    320 self.scheduler = await self.scheduler

File ~/miniconda3/envs/rapids-23.04/lib/python3.10/site-packages/distributed/scheduler.py:3481, in Scheduler.__init__(self, loop, delete_interval, synchronize_worker_interval, services, service_kwargs, allowed_failures, extensions, validate, scheduler_file, security, worker_ttl, idle_timeout, interface, host, port, protocol, dashboard_address, dashboard, http_prefix, preload, preload_argv, plugins, contact_address, transition_counter_max, jupyter, **kwargs)
   3480 if show_dashboard:
-> 3481     distributed.dashboard.scheduler.connect(
   3482         self.http_application, self.http_server, self, prefix=http_prefix
   3483     )
   3484 self.jupyter = jupyter

File ~/miniconda3/envs/rapids-23.04/lib/python3.10/site-packages/distributed/dashboard/scheduler.py:158, in connect(application, http_server, scheduler, prefix)
    156 def connect(application, http_server, scheduler, prefix=""):
    157     bokeh_app = BokehApplication(
--> 158         applications, scheduler, prefix=prefix, template_variables=template_variables()
    159     )
    160     application.add_application(bokeh_app)

File ~/miniconda3/envs/rapids-23.04/lib/python3.10/site-packages/cytoolz/functoolz.pyx:475, in cytoolz.functoolz._memoize.__call__()

File ~/miniconda3/envs/rapids-23.04/lib/python3.10/site-packages/distributed/dashboard/scheduler.py:131, in template_variables()
    123 from distributed.diagnostics.nvml import device_get_count
    125 template_variables = {
    126     "pages": [
    127         "status",
    128         "workers",
    129         "tasks",
    130         "system",
--> 131         *(["gpu"] if device_get_count() > 0 else []),
    132         "profile",
    133         "graph",
    134         "groups",
    135         "info",
    136     ],
    137     "plots": [
    138         {
    139             "url": x.strip("/"),
    140             "name": " ".join(x.strip("/").split("-")[1:])
    141             .title()
    142             .replace("Cpu", "CPU")
    143             .replace("Gpu", "GPU"),
    144         }
    145         for x in applications
    146         if "individual" in x
    147     ]
    148     + [{"url": "hardware", "name": "Hardware"}],
    149 }
    150 template_variables["plots"] = sorted(
    151     template_variables["plots"], key=lambda d: d["name"]
    152 )

File ~/miniconda3/envs/rapids-23.04/lib/python3.10/site-packages/distributed/diagnostics/nvml.py:126, in device_get_count()
    125 def device_get_count():
--> 126     init_once()
    127     if not is_initialized():

File ~/miniconda3/envs/rapids-23.04/lib/python3.10/site-packages/distributed/diagnostics/nvml.py:108, in init_once()
    105     return
    107 if _in_wsl() and parse_version(
--> 108     pynvml.nvmlSystemGetDriverVersion().decode()
    109 ) < parse_version(MINIMUM_WSL_VERSION):
    110     NVML_STATE = NVMLState.DISABLED_WSL_INSUFFICIENT_DRIVER

AttributeError: 'str' object has no attribute 'decode'

The above exception was the direct cause of the following exception:

RuntimeError                              Traceback (most recent call last)
Cell In[22], line 3
      1 from dask_cuda import LocalCUDACluster
----> 3 cluster = LocalCUDACluster()
      4 client = Client(cluster)

File ~/miniconda3/envs/rapids-23.04/lib/python3.10/site-packages/dask_cuda/local_cuda_cluster.py:336, in LocalCUDACluster.__init__(self, CUDA_VISIBLE_DEVICES, n_workers, threads_per_worker, memory_limit, device_memory_limit, data, local_directory, shared_filesystem, protocol, enable_tcp_over_ucx, enable_infiniband, enable_nvlink, enable_rdmacm, rmm_pool_size, rmm_maximum_pool_size, rmm_managed_memory, rmm_async, rmm_log_directory, rmm_track_allocations, jit_unspill, log_spilling, worker_class, pre_import, **kwargs)
    329     worker_class = partial(
    330         LoggedNanny if log_spilling is True else Nanny,
    331         worker_class=worker_class,
    332     )
    334 self.pre_import = pre_import
--> 336 super().__init__(
    337     n_workers=0,
    338     threads_per_worker=threads_per_worker,
    339     memory_limit=self.memory_limit,
    340     processes=True,
    341     data=data,
    342     local_directory=local_directory,
    343     protocol=protocol,
    344     worker_class=worker_class,
    345     config={
    346         "distributed.comm.ucx": get_ucx_config(
    347             enable_tcp_over_ucx=enable_tcp_over_ucx,
    348             enable_nvlink=enable_nvlink,
    349             enable_infiniband=enable_infiniband,
    350             enable_rdmacm=enable_rdmacm,
    351         )
    352     },
    353     **kwargs,
    354 )
    356 self.new_spec["options"]["preload"] = self.new_spec["options"].get(
    357     "preload", []
    358 ) + ["dask_cuda.initialize"]
    359 self.new_spec["options"]["preload_argv"] = self.new_spec["options"].get(
    360     "preload_argv", []
    361 ) + ["--create-cuda-context"]

File ~/miniconda3/envs/rapids-23.04/lib/python3.10/site-packages/distributed/deploy/local.py:253, in LocalCluster.__init__(self, name, n_workers, threads_per_worker, processes, loop, start, host, ip, scheduler_port, silence_logs, dashboard_address, worker_dashboard_address, diagnostics_port, services, worker_services, service_kwargs, asynchronous, security, protocol, blocked_handlers, interface, worker_class, scheduler_kwargs, scheduler_sync_interval, **worker_kwargs)
    250 worker = {"cls": worker_class, "options": worker_kwargs}
    251 workers = {i: worker for i in range(n_workers)}
--> 253 super().__init__(
    254     name=name,
    255     scheduler=scheduler,
    256     workers=workers,
    257     worker=worker,
    258     loop=loop,
    259     asynchronous=asynchronous,
    260     silence_logs=silence_logs,
    261     security=security,
    262     scheduler_sync_interval=scheduler_sync_interval,
    263 )

File ~/miniconda3/envs/rapids-23.04/lib/python3.10/site-packages/distributed/deploy/spec.py:286, in SpecCluster.__init__(self, workers, scheduler, worker, asynchronous, loop, security, silence_logs, name, shutdown_on_close, scheduler_sync_interval)
    284 if not called_from_running_loop:
    285     self._loop_runner.start()
--> 286     self.sync(self._start)
    287     try:
    288         self.sync(self._correct_state)

File ~/miniconda3/envs/rapids-23.04/lib/python3.10/site-packages/distributed/utils.py:338, in SyncMethodMixin.sync(self, func, asynchronous, callback_timeout, *args, **kwargs)
    336     return future
    337 else:
--> 338     return sync(
    339         self.loop, func, *args, callback_timeout=callback_timeout, **kwargs
    340     )

File ~/miniconda3/envs/rapids-23.04/lib/python3.10/site-packages/distributed/utils.py:405, in sync(loop, func, callback_timeout, *args, **kwargs)
    403 if error:
    404     typ, exc, tb = error
--> 405     raise exc.with_traceback(tb)
    406 else:
    407     return result

File ~/miniconda3/envs/rapids-23.04/lib/python3.10/site-packages/distributed/utils.py:378, in sync.<locals>.f()
    376         future = asyncio.wait_for(future, callback_timeout)
    377     future = asyncio.ensure_future(future)
--> 378     result = yield future
    379 except Exception:
    380     error = sys.exc_info()

File ~/miniconda3/envs/rapids-23.04/lib/python3.10/site-packages/tornado/gen.py:769, in Runner.run(self)
    766 exc_info = None
    768 try:
--> 769     value = future.result()
    770 except Exception:
    771     exc_info = sys.exc_info()

File ~/miniconda3/envs/rapids-23.04/lib/python3.10/site-packages/distributed/deploy/spec.py:330, in SpecCluster._start(self)
    328 self.status = Status.failed
    329 await self._close()
--> 330 raise RuntimeError(f"Cluster failed to start: {e}") from e

RuntimeError: Cluster failed to start: 'str' object has no attribute 'decode'

My Dask version is 2023.2.0.

I tried reinstalling rapidsai and downgrading my Python version from 3.10 to 3.8, and I also tried different parameters for LocalCUDACluster(), but none of these worked.

Comment: Are these all installed using conda from the same channel, or did you mix and match with pip? (Feb 24, 2023)
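
A quick way to check is the Channel column that conda prints for each package; pip-installed packages show up as "pypi" (shell command, assuming a conda-based install):

conda list | grep -iE "dask|pynvml|rapids"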

1 Answer


There was an unexpected breaking change in pynvml that impacted dask-cuda: pynvml.nvmlSystemGetDriverVersion() now returns str instead of bytes, which is why the .decode() call in your traceback raises AttributeError: 'str' object has no attribute 'decode'. dask-cuda has issued a hotfix release (23.02.01) that resolves this in the stable release.
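
For reference, here is a minimal sketch of the incompatibility and a version-agnostic way around it (my own illustration, not dask-cuda's fix; it assumes pynvml is installed and an NVIDIA driver is available):

import pynvml

# Query the driver version the same way distributed's nvml diagnostics do.
pynvml.nvmlInit()
version = pynvml.nvmlSystemGetDriverVersion()

# Older pynvml releases return bytes; newer ones return str, so only
# decode when needed instead of calling .decode() unconditionally.
if isinstance(version, bytes):
    version = version.decode()

print("NVIDIA driver version:", version)
pynvml.nvmlShutdown()

Running this directly also tells you which of the two behaviors your installed pynvml has.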

I see you're using the nightly packages. In the nightly packages, this should have been resolved by this PR. I'm not able to reproduce your issue in the following environment:

mamba create -n rapids-23.04 -c rapidsai-nightly -c nvidia -c conda-forge rapids=23.04 python=3.8 cudatoolkit=11.5 jupyterlab strings_udf
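
If you need a stopgap before picking up the fixed packages, pinning pynvml back to a release that still returns bytes should also sidestep the error (an assumption based on the traceback above, not something I've verified in your exact environment):

pip install "pynvml<11.5"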

If you still experience this problem in a fresh environment, please file a dask-cuda GitHub issue.

