How to use the tensorflowonspark.util.get_ip_address function in tensorflowonspark

To help you get started, we’ve selected a few tensorflowonspark examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github yahoo / TensorFlowOnSpark / test / test_reservation.py View on Github external
def test_reservation_enviroment_not_exists_get_server_ip_return_actual_host_ip(self):
    tfso_server = Server(5)
    assert tfso_server.get_server_ip() == util.get_ip_address()
github yahoo / TensorFlowOnSpark / tensorflowonspark / reservation.py View on Github external
def get_server_ip(self):
    return os.getenv(TFOS_SERVER_HOST) if os.getenv(TFOS_SERVER_HOST) else util.get_ip_address()
github yahoo / TensorFlowOnSpark / tensorflowonspark / TFSparkNode.py View on Github external
def _train(iter):
    # get shared queue, reconnecting if necessary
    mgr = _get_manager(cluster_info, util.get_ip_address(), util.read_executor_id())
    try:
      queue = mgr.get_queue(qname)
      equeue = mgr.get_queue('error')
    except (AttributeError, KeyError):
      msg = "Queue '{}' not found on this node, check for exceptions on other nodes.".format(qname)
      raise Exception(msg)

    state = str(mgr.get('state'))
    logger.info("mgr.state={0}".format(state))
    terminating = state == "'terminating'"
    if terminating:
      logger.info("mgr is terminating, skipping partition")
      count = sum(1 for item in iter)
      logger.info("Skipped {0} items from partition".format(count))
    else:
      logger.info("Feeding partition {0} into {1} queue {2}".format(iter, qname, queue))
github yahoo / TensorFlowOnSpark / tensorflowonspark / TFSparkNode.py View on Github external
def _inference(iter):
    # get shared queue, reconnecting if necessary
    mgr = _get_manager(cluster_info, util.get_ip_address(), util.read_executor_id())
    try:
      queue_in = mgr.get_queue(qname)
      equeue = mgr.get_queue('error')
    except (AttributeError, KeyError):
      msg = "Queue '{}' not found on this node, check for exceptions on other nodes.".format(qname)
      raise Exception(msg)

    logger.info("Feeding partition {0} into {1} queue {2}".format(iter, qname, queue_in))
    count = 0
    for item in iter:
      count += 1
      queue_in.put(item, block=True)

    # signal "end of partition"
    queue_in.put(marker.EndPartition())
github yahoo / TensorFlowOnSpark / tensorflowonspark / TFSparkNode.py View on Github external
def _shutdown(iter):
    host = util.get_ip_address()
    executor_id = util.read_executor_id()

    # reconnect to shared queue
    mgr = _get_manager(cluster_info, host, executor_id)

    # send SIGTERM to Tensorboard proc (if running)
    for node in cluster_info:
      if node['host'] == host and node['executor_id'] == executor_id:
        tb_pid = node['tb_pid']
        if tb_pid != 0:
          logger.info("Stopping tensorboard (pid={0})".format(tb_pid))
          subprocess.Popen(["kill", str(tb_pid)])

    # terminate any listening queues
    logger.info("Stopping all queues")
    for q in queues:
github yahoo / TensorFlowOnSpark / tensorflowonspark / TFSparkNode.py View on Github external
gpus_to_use = gpu_info.get_gpus(num_gpus)

    # assign TF job/task based on provided cluster_spec template (or use default/null values)
    job_name = 'default'
    task_index = -1
    cluster_id = cluster_meta['id']
    cluster_template = cluster_meta['cluster_template']
    for jobtype in cluster_template:
      nodes = cluster_template[jobtype]
      if executor_id in nodes:
        job_name = jobtype
        task_index = nodes.index(executor_id)
        break

    # get unique key (hostname, executor_id) for this executor
    host = util.get_ip_address()
    util.write_executor_id(executor_id)
    port = 0

    # check for existing TFManagers
    if TFSparkNode.mgr is not None and str(TFSparkNode.mgr.get('state')) != "'stopped'":
      if TFSparkNode.cluster_id == cluster_id:
        # raise an exception to force Spark to retry this "reservation" task on another executor
        raise Exception("TFManager already started on {0}, executor={1}, state={2}".format(host, executor_id, str(TFSparkNode.mgr.get("state"))))
      else:
        # old state, just continue with creating new manager
        logger.warn("Ignoring old TFManager with cluster_id {0}, requested cluster_id {1}".format(TFSparkNode.cluster_id, cluster_id))

    # start a TFManager and get a free port
    # use a random uuid as the authkey
    authkey = uuid.uuid4().bytes
    addr = None