Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
# Start-up sequence for the agent server: monitors, agent instance, RPC peer,
# then node registration in etcd.
# NOTE(review): indentation was lost in this excerpt; the line after the `for`
# and the `put` after the final `if` are presumably their respective bodies.

# Build the stats and error plugin contexts from the shared etcd handle and
# local configuration, then initialise both before the agent is created.
self.stats_monitor = StatsPluginContext(self.etcd, self.local_config)
self.error_monitor = ErrorPluginContext(self.etcd, self.local_config)
await self.stats_monitor.init()
await self.error_monitor.init()
# The backend implementation is selected by configuration and imported
# dynamically as ai.backend.agent.<backend.value>.
# (`backend` exposes `.value` -- presumably an enum member; confirm.)
backend = self.local_config['agent']['backend']
agent_mod = importlib.import_module(f"ai.backend.agent.{backend.value}")
# Instantiate the agent through the class returned by the backend module,
# passing it both monitors created above.
self.agent = await agent_mod.get_agent_cls().new( # type: ignore
self.etcd,
self.local_config,
stats_monitor=self.stats_monitor,
error_monitor=self.error_monitor,
)
# Create the RPC peer bound to tcp://<rpc_addr>, using msgpack for
# (de)serialisation; debug_rpc follows the 'debug.enabled' config flag.
rpc_addr = self.local_config['agent']['rpc-listen-addr']
self.rpc_server = Peer(
bind=ZeroMQAddress(f"tcp://{rpc_addr}"),
transport=ZeroMQRPCTransport,
scheduler=ExitOrderedAsyncScheduler(),
serializer=msgpack.packb,
deserializer=msgpack.unpackb,
debug_rpc=self.local_config['debug']['enabled'],
)
# Register each name listed in self.rpc_function.functions, using the
# same-named attribute of this object as the handler.
for func_name in self.rpc_function.functions:
self.rpc_server.handle_function(func_name, getattr(self, func_name))
log.info('started handling RPC requests at {}', rpc_addr)
# Store this node's RPC host under the 'ip' key with NODE scope.
# (`rpc_addr` is used both in the f-string above and via `.host` here, so it
# is an object rather than a plain string -- presumably a host/port pair.)
await self.etcd.put('ip', rpc_addr.host, scope=ConfigScopes.NODE)
# The watcher port is optional: nmget falls back to None when the
# 'watcher.service-addr.port' path is absent, and nothing is written then.
watcher_port = utils.nmget(self.local_config, 'watcher.service-addr.port', None)
if watcher_port is not None:
await self.etcd.put('watcher_port', watcher_port, scope=ConfigScopes.NODE)
AGENT_RESOURCE_OCCUPYING_KERNEL_STATUSES,
USER_RESOURCE_OCCUPYING_KERNEL_STATUSES,
DEAD_KERNEL_STATUSES,
)
# Imports guarded by TYPE_CHECKING (presumably typing.TYPE_CHECKING, so they
# are only evaluated by static type checkers -- confirm against the import
# block, which is not visible here).
if TYPE_CHECKING:
from .scheduler import SchedulingContext, PendingSession, AgentAllocationContext
from ..gateway.events import EventDispatcher
# Names exported by `from <module> import *`.
__all__ = ['AgentRegistry', 'InstanceNotFound']
# Module logger wrapped in BraceStyleAdapter; log calls in this file use
# '{}'-style placeholders.
log = BraceStyleAdapter(logging.getLogger('ai.backend.manager.registry'))
# Module-level mapping, initially empty, keyed by agent address.
agent_peers: MutableMapping[str, zmq.asyncio.Socket] = {} # agent-addr to socket
class PeerInvoker(Peer):
class _CallStub:
_cached_funcs: Dict[str, Callable]
order_key: ContextVar[Optional[str]]
def __init__(self, peer: Peer):
self._cached_funcs = {}
self.peer = peer
self.order_key = ContextVar('order_key', default=None)
def __getattr__(self, name: str):
if f := self._cached_funcs.get(name, None):
return f
else:
async def _wrapped(*args, **kwargs):
port = int(_port)
except ValueError:
raise ValueError('Invalid port number', port)
if port <= 1024:
raise ValueError('Service port number must be larger than 1024.')
if port in (2000, 2001):
raise ValueError('Service port 2000 and 2001 is reserved for internal use.')
return {
'name': name,
'protocol': protocol,
'container_ports': (port,),
'host_ports': None, # determined after container start
}
class PeerInvoker(Peer):
class _CallStub:
_cached_funcs: Dict[str, Callable]
order_key: ContextVar[Optional[str]]
def __init__(self, peer: Peer):
self._cached_funcs = {}
self.peer = peer
self.order_key = ContextVar('order_key', default=None)
def __getattr__(self, name: str):
if f := self._cached_funcs.get(name, None): # noqa
return f
else:
async def _wrapped(*args, **kwargs):