Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def error_check(errors):
    """Return True once at least the required number of errors arrived.

    The threshold depends on ``num_nodes`` and ``num_objects``, which are
    read from outside this function.
    """
    if num_nodes != 1:
        # In a multinode setting, each object is evicted zero or one
        # times, so some of the nondeterministic tasks may not be
        # reexecuted.
        required = 1
    else:
        # In a single-node setting, each object is evicted and
        # reconstructed exactly once, so exactly half the objects will
        # produce an error during reconstruction.
        required = num_objects // 2
    return required <= len(errors)
# Collect the pushed errors, using error_check as the acceptance predicate
# (presumably polls until error_check(errors) is true — confirm against
# wait_for_errors).
errors = wait_for_errors(error_check)
# Make sure all the errors have the correct type.
assert all(error["type"] == ray_constants.HASH_MISMATCH_PUSH_ERROR
for error in errors)
# The cluster must still report its remaining processes as alive.
assert cluster.remaining_processes_alive()
# Fetch the i-th object and check that its first element equals its index.
value = ray.get(args[i])
assert value[0] == i
# Get each value starting from the beginning to force reconstruction.
# Currently, since we're not able to reconstruct `ray.put` objects that
# were evicted and whose originating tasks are still running, this
# for-loop should hang on its first iteration and push an error to the
# driver.
# NOTE(review): the comment above speaks of a for-loop, but the code below
# issues a single request for args[0] only — confirm the comment is current.
ray.worker.global_worker.raylet_client.fetch_or_reconstruct([args[0]],
False)
def error_check(errors):
    """Return True once more than one error has been collected."""
    error_count = len(errors)
    return error_count > 1
# Collect the pushed errors, using error_check as the acceptance predicate
# (presumably polls until error_check(errors) is true — confirm against
# wait_for_errors).
errors = wait_for_errors(error_check)
# Every collected error must be a put-reconstruction error.
assert all(error["type"] == ray_constants.PUT_RECONSTRUCTION_PUSH_ERROR
for error in errors)
memory=None,
object_store_memory=None,
redis_max_memory=None,
redis_port=None,
redis_shard_ports=None,
object_manager_port=None,
node_manager_port=None,
node_ip_address=None,
object_id_seed=None,
local_mode=False,
driver_mode=None,
redirect_worker_output=None,
redirect_output=None,
num_redis_shards=None,
redis_max_clients=None,
redis_password=ray_constants.REDIS_DEFAULT_PASSWORD,
plasma_directory=None,
worker_path=None,
huge_pages=False,
include_webui=None,
webui_host="127.0.0.1",
logging_level=logging.INFO,
logging_format=ray_constants.LOGGER_FORMAT,
plasma_store_socket_name=None,
raylet_socket_name=None,
temp_dir=None,
include_log_monitor=None,
autoscaling_config=None,
include_java=False,
java_worker_options=None,
load_code_from_local=False,
use_pickle=False,
# Ask the actor to save the checkpoint identified by checkpoint_id.
actor.save_checkpoint(actor_id, checkpoint_id)
# Once more checkpoint IDs are tracked than the configured limit, remove
# the first tracked ID (presumably the oldest — confirm ordering) and
# notify the actor that it has expired.
if (len(checkpoint_info.checkpoint_ids) >
ray._config.num_actor_checkpoints_to_keep()):
actor.checkpoint_expired(
actor_id,
checkpoint_info.checkpoint_ids.pop(0),
)
# Reset the counters that track progress since the last checkpoint.
checkpoint_info.num_tasks_since_last_checkpoint = 0
checkpoint_info.last_checkpoint_timestamp = now
except Exception:
# Checkpoint save or reload failed. Notify the driver.
# The formatted traceback is pushed as a CHECKPOINT_PUSH_ERROR tagged
# with the worker's current job id.
traceback_str = ray.utils.format_error_message(
traceback.format_exc())
ray.utils.push_error_to_driver(
self._worker,
ray_constants.CHECKPOINT_PUSH_ERROR,
traceback_str,
job_id=self._worker.current_job_id)
guaranteed when max_concurrency > 1.
name: The globally unique name for the actor.
detached: Whether the actor should be kept alive after driver
exits.
is_asyncio: Turn on async actor calls. This only works with direct
actor calls.
Returns:
A handle to the newly created actor.
"""
# Replace omitted call arguments with fresh empty containers.
if args is None:
args = []
if kwargs is None:
kwargs = {}
# When not specified, use the value reported by
# ray_constants.direct_call_enabled().
if is_direct_call is None:
is_direct_call = ray_constants.direct_call_enabled()
# Default concurrency: 100 for asyncio actors, otherwise 1.
if max_concurrency is None:
if is_asyncio:
max_concurrency = 100
else:
max_concurrency = 1
# Concurrency above 1 and asyncio are both rejected without direct calls.
# NOTE(review): with is_asyncio=True and max_concurrency left as None, the
# default of 100 makes the max_concurrency check fire first, so the
# is_asyncio-specific message is only reached when max_concurrency == 1.
if max_concurrency > 1 and not is_direct_call:
raise ValueError(
"setting max_concurrency requires is_direct_call=True")
if max_concurrency < 1:
raise ValueError("max_concurrency must be >= 1")
if is_asyncio and not is_direct_call:
raise ValueError(
"Setting is_asyncio requires is_direct_call=True.")
default=ray_constants.LOGGER_LEVEL,
type=str,
help=ray_constants.LOGGER_LEVEL_HELP)
@click.option(
"--logging-format",
required=False,
default=ray_constants.LOGGER_FORMAT,
type=str,
help=ray_constants.LOGGER_FORMAT_HELP)
def cli(logging_level, logging_format):
    # Look up the logging-module value for the level name given on the
    # command line (case-insensitively), then pass it and the format
    # string to Ray's logger setup.
    level_name = logging_level.upper()
    ray.utils.setup_logger(logging.getLevelName(level_name), logging_format)
# Fall back to DEFAULT_REMOTE_FUNCTION_NUM_TASK_RETRIES when max_retries
# is None.
self._max_retries = (DEFAULT_REMOTE_FUNCTION_NUM_TASK_RETRIES
if max_retries is None else max_retries)
# Pick up the function's `__ray_invocation_decorator__` attribute, or None
# when the function does not define one.
self._decorator = getattr(function, "__ray_invocation_decorator__",
None)
self._function_signature = ray.signature.extract_signature(
self._function)
# Starts out as None.  NOTE(review): presumably records the (session, job)
# of the most recent export — confirm where it is updated.
self._last_export_session_and_job = None
# Override task.remote's signature and docstring
# The proxy forwards its positional and keyword arguments to self._remote;
# `wraps` (assumed to be functools.wraps — confirm) copies the wrapped
# function's metadata onto it.
@wraps(function)
def _remote_proxy(*args, **kwargs):
return self._remote(args=args, kwargs=kwargs)
self.remote = _remote_proxy
# Record the value of ray_constants.direct_call_enabled() on the instance.
self.direct_call_enabled = ray_constants.direct_call_enabled()
# Try to launch Redis up to num_retries times, switching to a new port
# after each attempt whose process has already exited.
while counter < num_retries:
if counter > 0:
logger.warning("Redis failed to start, retrying now.")
# Construct the command to start the Redis server.
command = [executable]
if password:
# Passwords containing spaces are rejected before being passed on the
# command line.
if " " in password:
raise ValueError("Spaces not permitted in redis password.")
command += ["--requirepass", password]
command += (
["--port", str(port), "--loglevel", "warning"] + load_module_args)
process_info = start_ray_process(
command,
ray_constants.PROCESS_TYPE_REDIS_SERVER,
stdout_file=stdout_file,
stderr_file=stderr_file)
time.sleep(0.1)
# Check if Redis successfully started (or at least if the executable
# did not exit within 0.1 seconds).
if process_info.process.poll() is None:
break
port = new_port()
counter += 1
# Every attempt was used up without leaving the loop early.
if counter == num_retries:
raise Exception("Couldn't start Redis. Check log files: {} {}".format(
stdout_file.name, stderr_file.name))
# Create a Redis client just for configuring Redis.
redis_client = redis.StrictRedis(
host="127.0.0.1", port=port, password=password)
time.sleep(0.5)
if not resources:
# NOTE: This hides the possibility that Ray may be waiting for
# clients to connect.
# NOTE(review): this branch is also taken when `resources` is None, in
# which case setdefault would raise AttributeError — confirm that the
# code above always leaves a dict here.
resources.setdefault("CPU", 0)
resources.setdefault("GPU", 0)
logger.warning("Cluster resources cannot be detected or are 0. "
"You can resume this experiment by passing in "
"`resume=True` to `run`.")
# Work on a copy so that popping the well-known keys below does not
# mutate the mapping that was looked up.
resources = resources.copy()
num_cpus = resources.pop("CPU", 0)
num_gpus = resources.pop("GPU", 0)
# Memory quantities are converted with ray_constants.from_memory_units.
memory = ray_constants.from_memory_units(resources.pop("memory", 0))
object_store_memory = ray_constants.from_memory_units(
resources.pop("object_store_memory", 0))
# Whatever keys remain after the pops are passed on as custom resources.
custom_resources = resources
self._avail_resources = Resources(
int(num_cpus),
int(num_gpus),
memory=int(memory),
object_store_memory=int(object_store_memory),
custom_resources=custom_resources)
# Remember when this refresh happened and mark resources as initialized.
self._last_resource_refresh = time.time()
self._resources_initialized = True
# In local mode, attach a LocalModeManager to the worker and return early;
# none of the steps below are executed.
if mode == LOCAL_MODE:
worker.local_mode_manager = LocalModeManager()
return
# Check that the version information matches the version information
# that the Ray cluster was started with.
try:
ray.services.check_version_info(worker.redis_client)
except Exception as e:
# In script mode the exception is re-raised to the caller; in worker
# mode the traceback is pushed to the driver through Redis instead.
# NOTE(review): for any other mode neither branch runs and the
# exception is dropped silently — confirm that this is intended.
if mode == SCRIPT_MODE:
raise e
elif mode == WORKER_MODE:
traceback_str = traceback.format_exc()
ray.utils.push_error_to_driver_through_redis(
worker.redis_client,
ray_constants.VERSION_MISMATCH_PUSH_ERROR,
traceback_str,
job_id=None)
# Give the worker a re-entrant lock.
worker.lock = threading.RLock()
# Create an object for interfacing with the global state.
ray.state.state._initialize_global_state(
node.redis_address, redis_password=node.redis_password)
# Register the worker with Redis.
if mode == SCRIPT_MODE:
# The concept of a driver is the same as the concept of a "job".
# Register the driver/job with Redis here.
import __main__ as main
driver_info = {
"node_ip_address": node.node_ip_address,