Page MenuHomePhabricator

Production annotations returning from database as binary strings
Closed, ResolvedPublicBUG REPORT

Description

Something (I suspect the MariaDB table settings) is causing annotation data added to the production database to come back as binary strings (b'...') when fetched again via the Django ORM:

Python 3.7.3 (default, Jan 22 2021, 20:04:44)
[GCC 8.3.0] on linux
Type "help", "copyright", "credits" or "license" for more information.
(InteractiveConsole)
>>> from toolhub.apps.toolinfo.models import Tool
>>> admin = Tool.objects.get(name="toolforge-admin")
>>> admin.annotations.icon
b'https://commons.wikimedia.org/wiki/File:Toolforge_logo.svg'
>>> admin.annotations.icon = "https://commons.wikimedia.org/wiki/File:Toolforge_logo.svg"
>>> admin.annotations.icon
'https://commons.wikimedia.org/wiki/File:Toolforge_logo.svg'
>>> admin.save()
2022-04-26T17:54:26Z [none] py.warnings WARNING: /opt/lib/poetry/toolhub-2uZo5AhP-py3.7/lib/python3.7/site-packages/elasticsearch/connection/http_urllib3.py:193: UserWarning: Connecting to https://host.docker.internal:19200 using SSL with verify_certs=False is insecure.
  % self.host

Traceback (most recent call last):
  File "/opt/lib/poetry/toolhub-2uZo5AhP-py3.7/lib/python3.7/site-packages/elasticsearch/serializer.py", line 121, in dumps
    data, default=self.default, ensure_ascii=False, separators=(",", ":")
  File "/usr/lib/python3.7/json/__init__.py", line 238, in dumps
    **kw).encode(obj)
  File "/usr/lib/python3.7/json/encoder.py", line 199, in encode
    chunks = self.iterencode(o, _one_shot=True)
  File "/usr/lib/python3.7/json/encoder.py", line 257, in iterencode
    return _iterencode(o, 0)
  File "/opt/lib/poetry/toolhub-2uZo5AhP-py3.7/lib/python3.7/site-packages/elasticsearch_dsl/serializer.py", line 11, in default
    return super(AttrJSONSerializer, self).default(data)
  File "/opt/lib/poetry/toolhub-2uZo5AhP-py3.7/lib/python3.7/site-packages/elasticsearch/serializer.py", line 106, in default
    raise TypeError("Unable to serialize %r (type: %s)" % (data, type(data)))
TypeError: Unable to serialize b'web app' (type: <class 'bytes'>)

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "<console>", line 1, in <module>
  File "/opt/lib/poetry/toolhub-2uZo5AhP-py3.7/lib/python3.7/site-packages/safedelete/models.py", line 107, in save
    super(SafeDeleteModel, self).save(**kwargs)
  File "/opt/lib/poetry/toolhub-2uZo5AhP-py3.7/lib/python3.7/site-packages/django/db/models/base.py", line 740, in save
    force_update=force_update, update_fields=update_fields)
  File "/opt/lib/poetry/toolhub-2uZo5AhP-py3.7/lib/python3.7/site-packages/django/db/models/base.py", line 789, in save_base
    update_fields=update_fields, raw=raw, using=using,
  File "/opt/lib/poetry/toolhub-2uZo5AhP-py3.7/lib/python3.7/site-packages/django/dispatch/dispatcher.py", line 182, in send
    for receiver in self._live_receivers(sender)
  File "/opt/lib/poetry/toolhub-2uZo5AhP-py3.7/lib/python3.7/site-packages/django/dispatch/dispatcher.py", line 182, in <listcomp>
    for receiver in self._live_receivers(sender)
  File "/srv/app/toolhub/apps/search/signals.py", line 51, in handle_save
    super().handle_save(sender, instance, **kwargs)
  File "/opt/lib/poetry/toolhub-2uZo5AhP-py3.7/lib/python3.7/site-packages/django_elasticsearch_dsl/signals.py", line 57, in handle_save
    registry.update(instance)
  File "/opt/lib/poetry/toolhub-2uZo5AhP-py3.7/lib/python3.7/site-packages/django_elasticsearch_dsl/registries.py", line 141, in update
    doc().update(instance, **kwargs)
  File "/opt/lib/poetry/toolhub-2uZo5AhP-py3.7/lib/python3.7/site-packages/django_elasticsearch_dsl/documents.py", line 209, in update
    **kwargs
  File "/opt/lib/poetry/toolhub-2uZo5AhP-py3.7/lib/python3.7/site-packages/django_elasticsearch_dsl/documents.py", line 190, in _bulk
    return self.bulk(*args, **kwargs)
  File "/opt/lib/poetry/toolhub-2uZo5AhP-py3.7/lib/python3.7/site-packages/django_elasticsearch_dsl/documents.py", line 147, in bulk
    return bulk(client=self._get_connection(), actions=actions, **kwargs)
  File "/opt/lib/poetry/toolhub-2uZo5AhP-py3.7/lib/python3.7/site-packages/elasticsearch/helpers/actions.py", line 314, in bulk
    for ok, item in streaming_bulk(client, actions, *args, **kwargs):
  File "/opt/lib/poetry/toolhub-2uZo5AhP-py3.7/lib/python3.7/site-packages/elasticsearch/helpers/actions.py", line 226, in streaming_bulk
    actions, chunk_size, max_chunk_bytes, client.transport.serializer
  File "/opt/lib/poetry/toolhub-2uZo5AhP-py3.7/lib/python3.7/site-packages/elasticsearch/helpers/actions.py", line 84, in _chunk_actions
    data = serializer.dumps(data)
  File "/opt/lib/poetry/toolhub-2uZo5AhP-py3.7/lib/python3.7/site-packages/elasticsearch/serializer.py", line 124, in dumps
    raise SerializationError(data, e)
elasticsearch.exceptions.SerializationError: ({'annotations': {'wikidata_qid': None, 'deprecated': False, 'replaced_by': None, 'experimental': False, 'for_wikis': [], 'icon': 'https://commons.wikimedia.org/wiki/File:Toolforge_logo.svg', 'available_ui_languages': [], 'tool_type': b'web app', 'api_url': None, 'developer_docs_url': [], 'user_docs_url': [], 'feedback_url': [], 'privacy_policy_url': [], 'translate_url': None, 'bugtracker_url': None}, 'created_by': {'id': 9, 'username': 'Toolhub'}, 'modified_by': {'id': 3, 'username': 'BDavis (WMF)'}, 'x_merged_ui_lang': None, 'x_merged_wiki': None, 'x_merged_type': None, 'name': 'toolforge-admin', 'title': 'Toolforge homepage', 'description': 'This tool serves the Toolforge landing page, and performs a number of scheduled processes to maintain the project metadata.\r\n\r\nMaintainers of this tool have administrative access to the tools project infrastructure, and are good people to contact if you are having issues.', 'url': 'https://admin.toolforge.org/', 'keywords': ['toolforge'], 'author': [{'name': 'Bryan Davis', 'wiki_username': None, 'developer_username': None, 'email': None, 'url': None}, {'name': 'Marc-André Pelletier', 'wiki_username': None, 'developer_username': None, 'email': None, 'url': None}], 'repository': 'https://phabricator.wikimedia.org/source/tool-admin-web/', 'subtitle': None, 'openhub_id': None, 'url_alternates': [], 'bot_username': None, 'deprecated': False, 'replaced_by': None, 'experimental': False, 'for_wikis': [], 'icon': None, 'license': None, 'sponsor': [], 'available_ui_languages': [], 'technology_used': [], 'tool_type': None, 'api_url': None, 'developer_docs_url': [], 'user_docs_url': [], 'feedback_url': [], 'privacy_policy_url': [], 'translate_url': None, 'bugtracker_url': None, '_schema': None, '_language': 'en', 'origin': 'crawler', 'created_date': datetime.datetime(2021, 10, 10, 20, 1, 14, 29064, tzinfo=<UTC>), 'modified_date': datetime.datetime(2022, 4, 26, 17, 54, 26, 78562, 
tzinfo=<UTC>)}, TypeError("Unable to serialize b'web app' (type: <class 'bytes'>)"))
>>> admin = Tool.objects.get(name="toolforge-admin")
>>> admin.annotations.icon
b'https://commons.wikimedia.org/wiki/File:Toolforge_logo.svg'

Event Timeline

bd808 changed the task status from Open to In Progress. Apr 26 2022, 6:00 PM
bd808 claimed this task.
bd808 triaged this task as High priority.

Confirmed that this is primarily a failure caused by creating the table using Django migrations without overriding all of the server's default configuration:

> show create table toolinfo_annotations\G
*************************** 1. row ***************************
       Table: toolinfo_annotations
Create Table: CREATE TABLE `toolinfo_annotations` (
  `id` int(11) NOT NULL AUTO_INCREMENT,
  `deleted` datetime(6) DEFAULT NULL,
  `tool_id` int(11) NOT NULL,
  `wikidata_qid` varbinary(32) DEFAULT NULL,
  `api_url` longblob DEFAULT NULL,
  `available_ui_languages` longblob NOT NULL,
  `bugtracker_url` longblob DEFAULT NULL,
  `deprecated` tinyint(1) NOT NULL,
  `developer_docs_url` longblob NOT NULL,
  `experimental` tinyint(1) NOT NULL,
  `feedback_url` longblob NOT NULL,
  `for_wikis` longblob NOT NULL,
  `icon` varbinary(2047) DEFAULT NULL,
  `privacy_policy_url` longblob NOT NULL,
  `replaced_by` longblob DEFAULT NULL,
  `tool_type` varbinary(32) DEFAULT NULL,
  `translate_url` longblob DEFAULT NULL,
  `user_docs_url` longblob NOT NULL,
  PRIMARY KEY (`id`),
  UNIQUE KEY `tool_id` (`tool_id`),
  CONSTRAINT `toolinfo_annotations_tool_id_730edccf_fk_toolinfo_tool_id` FOREIGN KEY (`tool_id`) REFERENCES `toolinfo_tool` (`id`)
) ENGINE=InnoDB AUTO_INCREMENT=1815 DEFAULT CHARSET=binary

This table really should be using CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci.

toolhub_admin@m5-master.eqiad.wmnet(toolhub)> ALTER DATABASE toolhub CHARACTER SET='utf8mb4' COLLATE='utf8mb4_unicode_ci';
Query OK, 1 row affected (0.002 sec)

toolhub_admin@m5-master.eqiad.wmnet(toolhub)> ALTER TABLE toolinfo_annotations CONVERT TO CHARACTER SET 'utf8mb4' COLLATE 'utf8mb4_unicode_ci';
Query OK, 0 rows affected (0.003 sec)
Records: 0  Duplicates: 0  Warnings: 0

toolhub_admin@m5-master.eqiad.wmnet(toolhub)> show create table toolinfo_annotations\G
*************************** 1. row ***************************
       Table: toolinfo_annotations
Create Table: CREATE TABLE `toolinfo_annotations` (
  `id` int(11) NOT NULL AUTO_INCREMENT,
  `deleted` datetime(6) DEFAULT NULL,
  `tool_id` int(11) NOT NULL,
  `wikidata_qid` varbinary(32) DEFAULT NULL,
  `api_url` longblob DEFAULT NULL,
  `available_ui_languages` longblob NOT NULL,
  `bugtracker_url` longblob DEFAULT NULL,
  `deprecated` tinyint(1) NOT NULL,
  `developer_docs_url` longblob NOT NULL,
  `experimental` tinyint(1) NOT NULL,
  `feedback_url` longblob NOT NULL,
  `for_wikis` longblob NOT NULL,
  `icon` varbinary(2047) DEFAULT NULL,
  `privacy_policy_url` longblob NOT NULL,
  `replaced_by` longblob DEFAULT NULL,
  `tool_type` varbinary(32) DEFAULT NULL,
  `translate_url` longblob DEFAULT NULL,
  `user_docs_url` longblob NOT NULL,
  PRIMARY KEY (`id`),
  UNIQUE KEY `tool_id` (`tool_id`),
  CONSTRAINT `toolinfo_annotations_tool_id_730edccf_fk_toolinfo_tool_id` FOREIGN KEY (`tool_id`) REFERENCES `toolinfo_tool` (`id`)
) ENGINE=InnoDB AUTO_INCREMENT=1815 DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci
1 row in set (0.005 sec)

toolhub_admin@m5-master.eqiad.wmnet(toolhub)> ALTER TABLE toolinfo_annotations
    -> MODIFY `wikidata_qid` varchar(32) COLLATE utf8mb4_unicode_ci DEFAULT NULL,
    -> MODIFY `api_url` longtext COLLATE utf8mb4_unicode_ci DEFAULT NULL,
    -> MODIFY `available_ui_languages` longtext COLLATE utf8mb4_unicode_ci NOT NULL,
    -> MODIFY `bugtracker_url` longtext COLLATE utf8mb4_unicode_ci DEFAULT NULL,
    -> MODIFY `developer_docs_url` longtext COLLATE utf8mb4_unicode_ci NOT NULL,
    -> MODIFY `feedback_url` longtext COLLATE utf8mb4_unicode_ci NOT NULL,
    -> MODIFY `for_wikis` longtext COLLATE utf8mb4_unicode_ci NOT NULL,
    -> MODIFY `icon` varchar(2047) COLLATE utf8mb4_unicode_ci DEFAULT NULL,
    -> MODIFY `privacy_policy_url` longtext COLLATE utf8mb4_unicode_ci NOT NULL,
    -> MODIFY `replaced_by` longtext COLLATE utf8mb4_unicode_ci DEFAULT NULL,
    -> MODIFY `tool_type` varchar(32) COLLATE utf8mb4_unicode_ci DEFAULT NULL,
    -> MODIFY `translate_url` longtext COLLATE utf8mb4_unicode_ci DEFAULT NULL,
    -> MODIFY `user_docs_url` longtext COLLATE utf8mb4_unicode_ci NOT NULL;
Query OK, 1814 rows affected (0.026 sec)
Records: 1814  Duplicates: 0  Warnings: 0

toolhub_admin@m5-master.eqiad.wmnet(toolhub)> show create table toolinfo_annotations\G
*************************** 1. row ***************************
       Table: toolinfo_annotations
Create Table: CREATE TABLE `toolinfo_annotations` (
  `id` int(11) NOT NULL AUTO_INCREMENT,
  `deleted` datetime(6) DEFAULT NULL,
  `tool_id` int(11) NOT NULL,
  `wikidata_qid` varchar(32) COLLATE utf8mb4_unicode_ci DEFAULT NULL,
  `api_url` longtext COLLATE utf8mb4_unicode_ci DEFAULT NULL,
  `available_ui_languages` longtext COLLATE utf8mb4_unicode_ci NOT NULL,
  `bugtracker_url` longtext COLLATE utf8mb4_unicode_ci DEFAULT NULL,
  `deprecated` tinyint(1) NOT NULL,
  `developer_docs_url` longtext COLLATE utf8mb4_unicode_ci NOT NULL,
  `experimental` tinyint(1) NOT NULL,
  `feedback_url` longtext COLLATE utf8mb4_unicode_ci NOT NULL,
  `for_wikis` longtext COLLATE utf8mb4_unicode_ci NOT NULL,
  `icon` varchar(2047) COLLATE utf8mb4_unicode_ci DEFAULT NULL,
  `privacy_policy_url` longtext COLLATE utf8mb4_unicode_ci NOT NULL,
  `replaced_by` longtext COLLATE utf8mb4_unicode_ci DEFAULT NULL,
  `tool_type` varchar(32) COLLATE utf8mb4_unicode_ci DEFAULT NULL,
  `translate_url` longtext COLLATE utf8mb4_unicode_ci DEFAULT NULL,
  `user_docs_url` longtext COLLATE utf8mb4_unicode_ci NOT NULL,
  PRIMARY KEY (`id`),
  UNIQUE KEY `tool_id` (`tool_id`),
  CONSTRAINT `toolinfo_annotations_tool_id_730edccf_fk_toolinfo_tool_id` FOREIGN KEY (`tool_id`) REFERENCES `toolinfo_tool` (`id`)
) ENGINE=InnoDB AUTO_INCREMENT=1815 DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci
1 row in set (0.002 sec)
Python 3.7.3 (default, Jan 22 2021, 20:04:44)
[GCC 8.3.0] on linux
Type "help", "copyright", "credits" or "license" for more information.
(InteractiveConsole)
>>> from toolhub.apps.toolinfo.models import Tool
>>> admin = Tool.objects.get(name="toolforge-admin")
>>> admin.annotations.icon
'https://commons.wikimedia.org/wiki/File:Toolforge_logo.svg'