Page MenuHomePhabricator

widespread failures, possibly related to float64
Closed, ResolvedPublicBUG REPORT

Description

Steps to replicate the issue (include links if applicable):

What happens?:
Too many failures.
Here's an example of the error display:

image.png (281×397 px, 15 KB)

What should have happened instead?:
Tests should pass

Software version (on Special:Version page; skip for WMF-hosted wikis like Wikipedia):

Other information (browser name/version, screenshots, etc.):

Event Timeline

Here's another clue: pi/4.0 gives
"Error in the function call API (propagated error: "Status 503 from wikilambda_fetch (ZIDs <Z21033>)")
Error type: Error in the function call API
Error data:
propagated error: "Status 503 from wikilambda_fetch (ZIDs <Z21033>)""

Also Integer conversion to js is broken, since sometimes negative integers are converted to positive ones (see Z33627)

There have been no changes to the converters in the last few months.

That fixed the failing JS test on https://www.wikifunctions.org/view/en/Z20849

Probably same issue on Python.

There was no change to relevant code in the orchestrator in this week's deploy, so it's likely caused by this week's evaluators deploy; almost all of that is in the Rust version; the entirety of the Node-version diff is:

$ git diff 5f6281d^..9c2ffcc -- :^rustversion :^.pipeline :^executors/wasm-utilities :^test :^test_data
diff --git a/lib/util.js b/lib/util.js
index da2106d..1b64e47 100644
--- a/lib/util.js
+++ b/lib/util.js
@@ -3,7 +3,7 @@
 const express = require( 'express' );
 const { v4: uuidv4 } = require( 'uuid' );
 const bunyan = require( 'bunyan' );
-const { execSync } = require( 'child_process' );
+const { Evaluator } = require( '../src/Evaluator.js' );
 
 /**
  * Error instance wrapping HTTP error responses
@@ -347,46 +347,20 @@ function createWasmSubProcessCount( app ) {
 		}
 	} );
 
-	const { count, error } = getWasmSubProcessCount();
+	const count = Evaluator.getGlobalEvaluator().getTotalExecutorCount();
 
-	if ( count !== 0 ) {
-		// log count as well for now
-		app.logger.log(
-			'debug',
-			{ message: `number of currently running WASM subprocesses: ${ count }` }
-		);
-	} else {
-		app.logger.log(
-			'error',
-			{ message: `error counting WASM subprocesses: ${ error }` }
-		);
-	}
+	app.logger.log(
+		'debug',
+		{ message: `number of currently running WASM subprocesses: ${ count }` }
+	);
 	// Set value of the Gauge to the count
 	metricsInstance.set( count );
 }
 
-/**
- * Creates a Gauge metric to count any unknown and active subprocesses.
- * Logs the count as well and if any errors arise, and returns the number.
- *
- * @return {Object.<number, ?Error>} The count of running WASM processes, and any error if occurred
- */
-function getWasmSubProcessCount() {
-	try {
-		// Run pgrep to count number of WASM subprocesses
-		// TODO (T411481): Improve subprocess counting method to be more robust
-		return { count: parseInt( execSync( 'pgrep -f wasm | wc -l' ).toString(), 10 ) - 1, error: null };
-	} catch ( error ) {
-		// Default value in case of error.
-		return { count: 0, error };
-	}
-}
-
 module.exports = {
 	createWasmSubProcessCount,
 	HTTPError,
 	initAndLogRequest,
-	getWasmSubProcessCount,
 	sendOutgoingResponseCountMetrics,
 	setHeartBeatMetric,
 	setErrorHandler,
diff --git a/routes/evaluate.js b/routes/evaluate.js
index 01e2c5b..cce61a1 100644
--- a/routes/evaluate.js
+++ b/routes/evaluate.js
@@ -2,6 +2,7 @@
 
 const sUtil = require( '../lib/util' );
 
+const { convertJSONToSafeDecircularLogString } = require( '../src/utils.js' );
 const { rateLimitMiddleware } = require( '../middleware/rateLimitMiddleware.js' );
 const { transformRequestBodyMiddleware } = require( '../middleware/exchangeFormat.js' );
 
@@ -50,17 +51,22 @@ router.post( '/', [ transformRequestBodyMiddleware, rateLimitMiddleware ], async
 	const errorFromResult = getZErrorTypeFromResult( result );
 	if ( errorFromResult ) {
 		statusCode = getHttpStatusCode( errorFromResult );
-		const stringifiedReducedRequest = JSON.stringify( functionCallRequest ).slice( 0, 10000 );
-		const stringifiedReducedResult = JSON.stringify( result ).slice( 0, 10000 );
-		// log payload details for 5xx's
-		const payloadInfo = ( statusCode >= 500 && statusCode < 600 ) ?
-			{ method: req.method, info: stringifiedReducedRequest,
-				errorResult: stringifiedReducedResult } :
-			{ method: req.method };
+
+		const payloadInfo = { method: req.method };
+
+		// log payload details for 5xxs
+		if ( statusCode >= 500 && statusCode < 600 ) {
+			payloadInfo.info = convertJSONToSafeDecircularLogString( functionCallRequest );
+			payloadInfo.errorResult = convertJSONToSafeDecircularLogString( result );
+		}
+
 		app.logger.log( 'warn',
-			{ message: `Returning HTTP status: ${ statusCode } due to error in Evaluator`,
-				requestId: req.context.reqId, labels: payloadInfo
-			} );
+			{
+				message: `Returning HTTP status: ${ statusCode } due to error in Evaluator`,
+				requestId: req.context.reqId,
+				labels: payloadInfo
+			}
+		);
 	}
 
 	if ( res.writableEnded || result === null ) {
diff --git a/routes/info.js b/routes/info.js
index fba85d2..067e129 100644
--- a/routes/info.js
+++ b/routes/info.js
@@ -1,5 +1,7 @@
 'use strict';
 
+const { Evaluator } = require( '../src/Evaluator.js' );
+const { HTTPStatus } = require( '../executors/javascript-wasmedge/function-schemata/javascript/src/error.js' );
 const sUtil = require( '../lib/util' );
 
 /**
@@ -18,11 +20,11 @@ let app;
  */
 router.get( '/', ( req, res ) => {
 
-	// TODO (T408899): If this isn't > 0, return an HTTP 500 so k8s doesn't send us more traffic?
-	const currentWASMSubProcessCount = sUtil.getWasmSubProcessCount().count;
+	const currentWASMSubProcessCount = Evaluator.getGlobalEvaluator().getTotalExecutorCount();
 
-	// simple sync return
-	res.json( {
+	const statusCode = currentWASMSubProcessCount > 0 ?
+		HTTPStatus.OK : HTTPStatus.SERVICE_UNAVAILABLE;
+	res.status( statusCode ).json( {
 		name: app.info.name,
 		version: app.info.version,
 		description: app.info.description,
diff --git a/src/Evaluator.js b/src/Evaluator.js
index 13d2338..caa3ca9 100644
--- a/src/Evaluator.js
+++ b/src/Evaluator.js
@@ -101,6 +101,10 @@ class Evaluator {
 		return globalEvaluator;
 	}
 
+	getTotalExecutorCount() {
+		return this.executorPools_.reduce( ( sum, pool ) => sum + pool.getPoolSize(), 0 );
+	}
+
 	async getExecutorFor( codingLanguage ) {
 		const index = this.codingLanguageToPoolIndex_.get( codingLanguage );
 		let executorPool;
@@ -243,8 +247,7 @@ class Evaluator {
 			[ 'evaluationStartTime', startTimeStr ],
 			[ 'evaluationEndTime', endTimeStr ],
 			[ 'evaluationDuration', durationStr ],
-			[ 'evaluationHostname', hostname ],
-			[ 'evaluationMemoryUsage', memoryUsageStr ]
+			[ 'evaluationHostname', hostname ]
 		].map( ( keyAndValue ) => keyAndValue.map( wrapInZ6 ) );
 		setMetadataValues( result, metadataValues );
 
diff --git a/src/ExecutorPool.js b/src/ExecutorPool.js
index 92aee30..36d7a58 100644
--- a/src/ExecutorPool.js
+++ b/src/ExecutorPool.js
@@ -78,6 +78,10 @@ class ExecutorPool {
      * We need IMMEDIATE access to the executor class itself, so it can't be
      * blocked on the promise.
      */
+	getPoolSize() {
+		return this.freeExecutors_.length;
+	}
+
 	async getExecutor() {
 		const lock = await this.freeExecutorMutex_.acquire();
 		try {
diff --git a/src/utils.js b/src/utils.js
index fdd795f..538f7d5 100644
--- a/src/utils.js
+++ b/src/utils.js
@@ -1,9 +1,31 @@
 'use strict';
 
+const { convertJSONToSafeLogString } = require( '../executors/javascript-wasmedge/function-schemata/javascript/src/utils' );
+
 async function sleep( ms ) {
 	await new Promise( ( resolve ) => {
 		setTimeout( resolve, ms );
 	} );
 }
 
-module.exports = { sleep };
+function convertJSONToSafeDecircularLogString( obj ) {
+	// If we encounter the same object reference twice, it's a circular reference
+	// so we can return the string '[Circular]' instead of recursing infinitely.
+	//
+	// WeakSet (vs Set) is used so held references don't block garbage collection
+	// once stringify finishes and `seen` goes out of scope.
+	const seen = new WeakSet();
+	const circularReplacer = ( key, value ) => {
+		if ( typeof value === 'object' && value !== null ) {
+			if ( seen.has( value ) ) {
+				return '[Circular]';
+			}
+			seen.add( value );
+		}
+		return value;
+	};
+
+	return convertJSONToSafeLogString( obj, circularReplacer );
+}
+
+module.exports = { sleep, convertJSONToSafeDecircularLogString };

(i.e. just a logging and _info count).

Huh, Weird. I wonder why the output changed then.

On https://www.wikifunctions.org/view/en/Z20849 the test https://www.wikifunctions.org/view/en/Z21466 still fails for Python.

When you look at the results, you can see that the results are equivalent. So this is probably an issue in the equality test for floats. This case should be added as an equality test to https://www.wikifunctions.org/view/en/Z20850 and then fixed.

Taking a break from this for now, I plan to come back, but if someone else fixes it in the meantime, I will be happy.

OK, fixed that too. The JS float converter needed some more fixing as well:

Now all the tests on the functions mentioned in the bug are working again.

I wouldn't be surprised if there are other types that also don't work anymore. Please make a new bug and reach out to me in that case.

Closing this one for now.

DVrandecic claimed this task.

Thanks for the responsiveness. We'll surely let you know if it happens on other Types!