Page MenuHomePhabricator

"Duplicate entry '0' for key 'proxy'"
Closed, DeclinedPublic3 Estimated Story PointsBUG REPORT

Description

What is the problem?

Some combinations of data will lead to the below exception. I can only reproduce this if I run import-data.js after running init-db.js. The reproduction data below is the same as for T339324.

/srv/service/node_modules/mariadb/lib/misc/errors.js:61
  return new SqlError(msg, sql, fatal, info, sqlState, errno, additionalStack, addHeader);
         ^

SqlError: (conn=157, no: 1062, SQLState: 23000) Duplicate entry '0' for key 'proxy'
sql: INSERT INTO proxies (proxy) VALUES (?); - parameters:[false]
    at Object.module.exports.createError (/srv/service/node_modules/mariadb/lib/misc/errors.js:61:10)
    at PacketNodeEncoded.readError (/srv/service/node_modules/mariadb/lib/io/packet.js:511:19)
    at Query.readResponsePacket (/srv/service/node_modules/mariadb/lib/cmd/resultset.js:46:28)
    at PacketInputStream.receivePacketBasic (/srv/service/node_modules/mariadb/lib/io/packet-input-stream.js:104:9)
    at PacketInputStream.onData (/srv/service/node_modules/mariadb/lib/io/packet-input-stream.js:169:20)
    at Socket.emit (node:events:513:28)
    at addChunk (node:internal/streams/readable:315:12)
    at readableAddChunk (node:internal/streams/readable:289:9)
    at Socket.Readable.push (node:internal/streams/readable:228:10)
    at TCP.onStreamRead (node:internal/stream_base_commons:190:23) {
  text: "Duplicate entry '0' for key 'proxy'",
  sql: 'INSERT INTO proxies (proxy) VALUES (?); - parameters:[false]',
  fatal: false,
  errno: 1062,
  sqlState: '23000',
  code: 'ER_DUP_ENTRY'
}
Steps to reproduce problem
  1. Save the JSON from "Reproduction data" below as a .gz file (e.g. type_too_big.json.gz) into the ipoid/tmp directory
  2. If necessary, start up docker in the ipoid directory (e.g. docker compose up -d)
  3. Initialise the database: docker compose exec web node init-db.js
  4. Run: docker compose exec -e FEED_PATH=./tmp/type_too_big.json.gz web node import-data.js
Environment

ipoid commit deed45d61626f952ce8b9365e12e8bd98703fa17

Reproduction data
{"as": {}, "client": {"behaviors": [null, -106, "\u0003", -30479], "concentration": {"country": true, "skew": -15301, "state": null}, "count": false, "countries": -23218, "proxies": [false, true, false, "\ud9ff\ude86\ud997\udd27\u00d6?\ud92f\udf71\u00f7\u00c7\u00ec\u00d0%\u00dfMX\ud9f6\uddb1+\u000bPB\u00d1\ud8b2\ude56"], "spread": "", "types": ["HEADLESS", "DESKTOP", "HEADLESS", "HEADLESS"]}, "infrastructure": null, "location": {"city": "\udb68\udf27", "country": "\u00fc\u00f0|\u00a8\udb53\ude5a\ud977\udf0c\u00ac&X\ud956\udc18\u001b\udbd1\uddda", "state": "\u0083"}, "organization": "\u00d3\u00a1", "risks": ["WEB_SCRAPING", "WEB_SCRAPING", "WEB_SCRAPING", "WEB_SCRAPING", "GEO_MISMATCH", "LOGIN_BRUTEFORCE", "CALLBACK_PROXY", "WEB_SCRAPING", "TUNNEL", "GEO_MISMATCH", "TUNNEL"], "services": [false, null], "tunnels": [{"anonymous": true, "entries": [true, null, "\udbdf\udd91\u00e90", "\u0082", "\u00a2f\uda3b\udf65\u009c\"\r", false, false, "#", 30612], "operator": "\u00a2", "type": "\ud8a8\uded2.\b"}, {"anonymous": "\ud8c2\udfda", "entries": ["\\", "\u00cb#_\u007f", "a", -8932], "operator": "\u00bf;\u00b8\u000f\u000b\u00c2\u00c8\u00de\ud84b\udcce", "type": "\u00ab"}, {"anonymous": 14808, "entries": ["\udb9b\uded9\u0013\u00de", null, false, true, -7, false, 5732, null, null], "operator": "\u009c", "type": "\u00da\u00a6\u00c2\nJ1\u00a8y\u0097\u00eaH}\u00b3\u00e6\udbf2\udd6f\u0016"}, {"anonymous": "\uda4c\udf55", "entries": [-16, "u"], "operator": 1882290103, "type": "\udb56\udec2\u0000"}, {"anonymous": "\ud8a9\udefb", "entries": [], "operator": "\u00c9\u00b0", "type": "~"}, {"anonymous": -96936237097570309219903303351460102886, "entries": ["\u009eQ\udb9f\udc14"], "operator": "\u009b", "type": null}, {"anonymous": "", "entries": ["\u001d\u00f2\udbc9\udeaa \u008c", true, "\u0005\ud8bf\udda6\uda34\udc51", false, false, null, null, false, 21388, null, 54, -28538, null], "operator": "\uda68\udd1c", "type": "\ua965"}], "ip": "182c7264-8dfc-4f78-8ff1-8bf12fca19a4"}
{"as": {"Organization": false}, "client": {"behaviors": [], "concentration": {"country": 10376}, "count": -10809, "countries": false, "proxies": [], "spread": null, "types": ["IOT", "DESKTOP", "MOBILE", "HEADLESS", "MOBILE", "HEADLESS"]}, "infrastructure": null, "location": {"state": true, "city": false, "country": true}, "organization": "\u00a8\udb49\udda8", "risks": ["CALLBACK_PROXY", "GEO_MISMATCH", "LOGIN_BRUTEFORCE"], "services": ["\udae9\udda4W", "\u00aa", false, true, "\u00c3", false, null], "tunnels": [{"anonymous": "\uda00\udcf9", "entries": ["\udbd2\udd75\u008a\u00b8\u00dc", "\u875e", true, null, "\u00b2\u00147<\u001eZ", 25457, "\u8eca"], "operator": -307550890, "type": "\u000bC\u00e5;\u0084\u0013\u00bee"}, {"anonymous": "(n\u00c4S\u00a5\ud9f8\udedf\u00ac\u00f9\u0012\u00b6", "entries": [null, "", -1162566802, null, 1966272010, false], "operator": false, "type": true}, {"anonymous": "\u00bb\uda98\udc86\ud98f\udc7d", "entries": [false, "\u00f7\u00b0\u00a8", "{Lg\u0085\udb98\udef0\u00e0", "", "\u00b8", "\u00f2", 6619], "operator": "\u0001", "type": "\udaf9\udc1c"}, {"anonymous": null, "entries": ["\ud998\ude1b", null], "operator": null, "type": "\u00be"}, {"anonymous": null, "entries": ["}", 6367086318877159199, "\ud9ed\ude85"], "operator": null, "type": false}, {"anonymous": "p", "entries": ["\u00d2", true, "\ud837\uddb5\ud942\udf7a\u00e4", "\u0088\u00f5u\u00d9\"\u00b7\u00a3\ud923\udf30\udac1\udf09\ud8e5\udd32\u00ac\u00b1\u00d1", true, null, 808646626, true, 14965], "operator": -84, "type": "\u000eU\ud8a1\ude18\udb6a\udd8a<\ud8c7\udfae\ud8f8\ude56Ti\uda8b\udf10\u00f5\udb7b\udf05"}, {"anonymous": "\u00e0\u00e5\udbb3\udc29\u00c4\udb68\udc2d\u00b2\u0096\u00da\u0091", "entries": [-103, 5152, "t", true, null, -41, "r\u00e5\u0099\u00f9", -118, "\u00f5", "\u00cd\u00d06\udb7b\udf76\u0001", "L", -7853, "\u00de", null, "\u00f3"], "operator": "p\u00b5\u0087", "type": -11008}, {"anonymous": 95, "entries": [true, "1\u0006", "\uda2d\udcc4", -15444], "operator": "\u00a2r\u0091\u00d0\u00d7\ud927\udec4\udb67\udd13}\udbb2\udf5d\udb05\udf81\u00b2", "type": null}], "ip": "93ffc461-ac49-43e1-bdc5-e7cad79851f3"}

Event Timeline

I have also seen this error for a legitimate proxy entry. The exception only appears intermittently (for 10 tries I got it 7 times):

/srv/service/node_modules/mariadb/lib/misc/errors.js:61
  return new SqlError(msg, sql, fatal, info, sqlState, errno, additionalStack, addHeader);
         ^

SqlError: (conn=1902, no: 1062, SQLState: 23000) Duplicate entry 'LUMINATI_PROXY' for key 'proxy'
sql: INSERT INTO proxies (proxy) VALUES (?); - parameters:['LUMINATI_PROXY']
    at Object.module.exports.createError (/srv/service/node_modules/mariadb/lib/misc/errors.js:61:10)
    at PacketNodeEncoded.readError (/srv/service/node_modules/mariadb/lib/io/packet.js:511:19)
    at Query.readResponsePacket (/srv/service/node_modules/mariadb/lib/cmd/resultset.js:46:28)
    at PacketInputStream.receivePacketBasic (/srv/service/node_modules/mariadb/lib/io/packet-input-stream.js:104:9)
    at PacketInputStream.onData (/srv/service/node_modules/mariadb/lib/io/packet-input-stream.js:169:20)
    at Socket.emit (node:events:513:28)
    at addChunk (node:internal/streams/readable:315:12)
    at readableAddChunk (node:internal/streams/readable:289:9)
    at Socket.Readable.push (node:internal/streams/readable:228:10)
    at TCP.onStreamRead (node:internal/stream_base_commons:190:23) {
  text: "Duplicate entry 'LUMINATI_PROXY' for key 'proxy'",
  sql: "INSERT INTO proxies (proxy) VALUES (?); - parameters:['LUMINATI_PROXY']",
  fatal: false,
  errno: 1062,
  sqlState: '23000',
  code: 'ER_DUP_ENTRY'
}

Reproduction data: ask @dom_walden

AGueyte set the point value for this task to 1.

Been testing our reprod data for different bugs lately (T339324, T339345, T340178, T341660) and haven't come across these specific issues with the Proxy table.

Looking at the Proxy table in the latest version of SchemaFile.js

CREATE TABLE IF NOT EXISTS proxies (
   pkid BIGINT UNSIGNED NOT NULL AUTO_INCREMENT,
   proxy VARBINARY(128) UNIQUE,
   PRIMARY KEY (pkid)
);

By importing the same files over and over again, the values simply do not duplicate into the existing DB. No errors are being raised. ImportData.js covers the cases of duplicate entries and increments the proxies pkid.

The issue am I getting from this reprod data is for the Tunnel table.

Incorrect integer value: '?' for column `test`.`tunnels`.`anonymous` at row 1
sql: INSERT INTO tunnels (operator, type, anonymous) VALUES (?, ?, ?) - parameters:['¿;¸
                                                    ÂÈÞ𢳎','«','񀯚']

  code: 'ER_TRUNCATED_WRONG_VALUE_FOR_FIELD'

More reprod data, if it helps:

More reprod data, if it helps:

Thanks for that, @dom_walden, I was able to replicate the error with the above test data.

  1. I get a duplicated key error on the first import.
  2. I also see new data added the more I import the same file, over and over again, randomly
  3. I get an "out of range" error when I quickly re-initiate and re-import the same file after the first try.

Reproducing the error

  1. When I first import the file, the bug seems to happen after the 195th row (I log each proxy before being added into DB).
"Duplicate entry 'r' for key 'proxy'".

The log in importData.js shows it has stopped for pkid 195 and proxy "ImFhy" as seen below, and while replicating the same steps, it has also stopped at row 189th. In the DB, I do have +200 entries from that first import, and pkid ordered by ASC, shows up to pkid 204.
We do see the 195th pkid being "ImFhy" and a lot more into DB that didn't get log from the import.

Screenshot 2023-07-28 at 10.11.36 AM.png (1×2 px, 1 MB)
Screenshot 2023-07-28 at 10.10.27 AM.png (1×892 px, 413 KB)
  1. I re-import the same file, not being blocked this time. I get +100 new entries. I would re-import, sometimes I would have new entries, and sometimes nothing.

Screenshot 2023-07-28 at 10.18.27 AM.png (810×1 px, 325 KB)

Re-initiating the DB, importing again, and getting the same block at row 195th.

"Duplicate entry 'r' for key 'proxy'".
  1. Reinitiate (quickly) the DB, re-import (quickly after), and get a new error at row 132nd:
"Duplicate entry 'hjLkoKK' for key 'proxy'"

The proxyPkid isn't being created (or at least, not logged), yet, still exists in the DB. I also got an error of "Duplicate entry 'PEydL' for key 'proxy'" at row 131st.

Screenshot 2023-07-28 at 10.29.44 AM.png (1×2 px, 1 MB)
Screenshot 2023-07-28 at 10.33.21 AM.png (626×784 px, 186 KB)

But what I also notice it's the second error always stops due to an "out of range" error.

Screenshot 2023-07-28 at 10.30.26 AM.png (632×2 px, 692 KB)

Routes for investigation:

  • There's a lag between the import and what is actually being saved into DB. Is it coming from the speed of our system?
  • There are a lot of errors happening in the import that might contribute to failing the import and creating lag:
{"name":"data-import","hostname":"fd12dff0eb3d","pid":2269,"level":50,"err":**{"message":"Cannot read properties of undefined (reading 'country')**","name":"TypeError","stack":"TypeError: Cannot read properties of undefined (reading 'country')\n    at Transform.transform [as _transform] (/srv/service/import-data.js:271:55)\n    at runMicrotasks (<anonymous>)\n    at processTicksAndRejections (node:internal/process/task_queues:96:5)"},"ip":"7ab3d130-8221-49bf-90b8-66e282a00eae","msg":"Cannot read properties of undefined (reading 'country')","time":"2023-07-28T13:43:38.241Z","v":0}
{"name":"data-import","hostname":"fd12dff0eb3d","pid":2365,"level":50,"err":{"message":"(conn=553, no: 1264, SQLState: 22003) Out of range value for column 'conc_density' at row 1\nsql: REPLACE INTO\n            actor_data (ip,org,client_count,types,conc_geohash,conc_city,conc_state,conc_country,conc_skew,conc_density,countries,location_country,risks)\n        VALUES\n            (?,?,?,?,?,?,?,?,?,?,?,?,?) - parameters:['4e006d00-98dd-4a46-...]"
AGueyte changed the point value for this task from 1 to 3.Jul 28 2023, 2:08 PM

More reprod data, if it helps:

Further testing with the same testing files. In this testing file, there are around 5000 entries. The "type": "R" value is indeed present 9 times in the file. 4 times in small caps and 5 times in capital letters.

"type": "R"
"type": "r"

Results:
Import with less than 1000 rows does not raise duplicate import error or any other type of error despite having multiple occurrences of the value "R" in the file. Yet, no error is raised but the value is also not saved into DB.
It seems that it's not the values themselves or the way we import that is causing issues but the capacity of our system in taking in so many errors, entries to be saved into DB, and logging errors, at the same time.

Test 1: Small file - No error - No value "r" added
Running: docker compose exec web node init-db.js
Running: docker compose exec web node import-data.js ./tmp/test-1.json.gz
Importing the 533 first rows, with two occurrences of "type": "R", one in small-cap and one in capital letter.
No error when importing.
Running: select * from proxies ORDER by pkid ASC;
In the DB, 4 rows were added, and no entry of value "r" or "R" was added.

Test 2: 2500 rows - Duplicate error - One value 'r' added in DB
Importing the 2500 first rows, with 9 occurrences of "type": "R"
ERROR: Duplicate entry 'r' for key 'proxy'. Import stopped at pkid 189.
In the DB, 204 entries are in the proxies table. Only one entry of value "r" was added with pkid 182

Test 3: 2300 rows - Duplicate error - One value 'r' added in DB
Importing the 2300 first rows, with 8 occurrences of "type": "R"
ERROR: Duplicate entry 'r' for key 'proxy'. Import stopped at pkid 189.
In the DB, 204 entries are in the proxies table. Only one entry of value "r" was added with pkid 182.

Test 4: 5000 rows - Duplicate error - One value 'r' added in DB
Importing the the 5000+ rows, but moving the 9 occurrences of "type": "R" within the first 800 rows.
ERROR: Duplicate entry 'r' for key 'proxy'. Import stopped at pkid 189.
In the DB, 204 entries are in the proxies table. Only one entry of value "r" was added with pkid 182.

Test 5: 1000 rows - No error - No value "r" added
Importing the 1000 first rows, with the 10 occurrences of "type": "R" within the first 800 rows.
No import error
In the DB, 47 entries are in the proxies table. No entry with value "r" or "R".

Test 6: 5000 rows - Duplicate error - One value 'r' added in DB
Importing the the 5000+ rows, but moving the 9 occurrences of "type": "R" within the first 20 rows.
ERROR: Duplicate entry 'r' for key 'proxy'. Import stopped at pkid 189.
In the DB, 204 entries are in the proxies table. Only one entry of value "r" was added with pkid 182.

Test 7: 2500 rows - Duplicate error - One value 'r' added in DB
Importing the the 2500 rows, but moving the 9 occurrences of "type": "R" within the first 20 rows.
ERROR: Duplicate entry 'r' for key 'proxy'. Import stopped at pkid 191.
In the DB, 204 entries are in the proxies table. Only one entry of value "r" was added with pkid 182.

Test 8: 1000 rows - No error - No value "r" added
Importing the the 1000 rows, but moving the 9 occurrences of "type": "R" within the first 20 rows.
No error
In the DB, 47 entries are in the proxies table. No entry with value "r" or "R".

@AGueyte Just wanted to check what's changed on this task since we discussed it in Engineering? Thanks

@AGueyte Just wanted to check what's changed on this task since we discussed it in Engineering? Thanks

Hi, nothing since we talked about it in Engineering meeting.
If I recall correctly, we decided to close this ticket as the issue won't be happening with T341122.

Thanks @AGueyte - happy to let @STran decide what to do with this task.

I don't think this is happening anymore now that we import the properties independently.