Hi!
We have moved all the model servers to KServe 0.10, including the nsfw one, with:
https://gerrit.wikimedia.org/r/c/machinelearning/liftwing/inference-services/+/894006
For some reason, when I try to hit the model with an HTTP request, it hangs inside its predict() function. While testing on localhost with Docker, I captured this py-bt trace:
(gdb) py-bt Traceback (most recent call first): <built-in method TFE_Py_Execute of PyCapsule object at remote 0x7f65093f7c60> File "/opt/lib/python/site-packages/tensorflow/python/eager/execute.py", line 314, in quick_execute File "/opt/lib/python/site-packages/tensorflow/python/eager/function.py", line 1366, in call File "/opt/lib/python/site-packages/tensorflow/python/eager/function.py", line 4263, in _call_flat File "/opt/lib/python/site-packages/tensorflow/python/eager/def_function.py", line 2005, in _call File "/opt/lib/python/site-packages/tensorflow/python/eager/def_function.py", line 1934, in __call__ File "/opt/lib/python/site-packages/tensorflow/python/util/traceback_utils.py", line 150, in error_handler File "/opt/lib/python/site-packages/keras/engine/training.py", line 3069, in predict File "/opt/lib/python/site-packages/keras/utils/traceback_utils.py", line 64, in error_handler File "/srv/nsfw-model/model-server/model.py", line 314, in predict File "/opt/lib/python/site-packages/kserve/model.py", line 629, in __call__ File "/opt/lib/python/site-packages/kserve/protocol/dataplane.py", line 276, in infer File "/opt/lib/python/site-packages/kserve/protocol/rest/v1_endpoints.py", line 69, in predict File "/opt/lib/python/site-packages/fastapi/routing.py", line 161, in run_endpoint_function File "/opt/lib/python/site-packages/fastapi/routing.py", line 1771, in app File "/opt/lib/python/site-packages/starlette/routing.py", line 66, in app File "/opt/lib/python/site-packages/starlette/routing.py", line 532, in handle File "/opt/lib/python/site-packages/starlette/routing.py", line 706, in __call__ File "/opt/lib/python/site-packages/fastapi/middleware/asyncexitstack.py", line 18, in __call__ File "/opt/lib/python/site-packages/starlette/middleware/exceptions.py", line 68, in __call__ File "/opt/lib/python/site-packages/timing_asgi/middleware.py", line 324, in __call__ File "/opt/lib/python/site-packages/starlette/middleware/errors.py", line 162, in __call__ 
File "/opt/lib/python/site-packages/starlette/applications.py", line 124, in __call__ File "/opt/lib/python/site-packages/fastapi/applications.py", line 270, in __call__ File "/opt/lib/python/site-packages/uvicorn/middleware/proxy_headers.py", line 331, in __call__ File "/opt/lib/python/site-packages/uvicorn/protocols/http/h11_impl.py", line 373, in run_asgi <built-in method run of Context object at remote 0x7f64f8c41d80> File "/usr/lib/python3.9/asyncio/events.py", line 80, in _run self._context.run(self._callback, *self._args) File "/usr/lib/python3.9/asyncio/base_events.py", line 2658, in _run_once File "/usr/lib/python3.9/asyncio/base_events.py", line 852, in run_forever getaddr_func = self._getaddrinfo_debug File "/usr/lib/python3.9/asyncio/base_events.py", line 629, in run_until_complete self.run_forever() File "/usr/lib/python3.9/asyncio/runners.py", line 300, in run File "/opt/lib/python/site-packages/kserve/protocol/rest/server.py", line 180, in run File "/usr/lib/python3.9/multiprocessing/process.py", line 571, in _bootstrap File "/usr/lib/python3.9/multiprocessing/popen_fork.py", line 71, in _launch code = process_obj._bootstrap(parent_sentinel=child_r) File "/usr/lib/python3.9/multiprocessing/popen_fork.py", line 19, in __init__ self._launch(process_obj) File "/usr/lib/python3.9/multiprocessing/context.py", line 277, in _Popen return Popen(process_obj) File "/usr/lib/python3.9/multiprocessing/context.py", line 224, in _Popen return _default_context.get_context().Process._Popen(process_obj) File "/usr/lib/python3.9/multiprocessing/process.py", line 633, in start File "/opt/lib/python/site-packages/kserve/model_server.py", line 396, in serve <built-in method run of Context object at remote 0x7f64f8a16d00> --Type <RET> for more, q to quit, c to continue without paging--c File "/usr/lib/python3.9/asyncio/events.py", line 80, in _run self._context.run(self._callback, *self._args) File "/usr/lib/python3.9/asyncio/base_events.py", line 2658, in _run_once File 
"/usr/lib/python3.9/asyncio/base_events.py", line 852, in run_forever getaddr_func = self._getaddrinfo_debug File "/usr/lib/python3.9/asyncio/base_events.py", line 629, in run_until_complete self.run_forever() File "/usr/lib/python3.9/asyncio/runners.py", line 300, in run File "/opt/lib/python/site-packages/kserve/model_server.py", line 404, in start File "/srv/nsfw-model/model-server/model.py", line 69, in <module>
The changes should only be related to request routing, so I am not sure why TensorFlow hangs. @Htriedman do you have any idea / suggestion about how to debug it? Thanks in advance for the help :)