Page MenuHomePhabricator

Migrate definitions storage from the legacy to new strategy
Closed, ResolvedPublic

Description

Migrate the definitions use case from legacy storage (Cassandra 2.x) to the new storage strategy and cluster (Cassandra 3.x).

Event Timeline

Eevans created this task.Oct 31 2017, 5:00 PM

The following script outputs a YAML file containing CQL literals for the queries necessary to create these tables.

1"use strict";
2
3
4const crypto = require('crypto');
5const colors = require('colors/safe');
6const P = require('bluebird');
7const yargs = require('yargs');
8
9const green = colors.green;
10const red = colors.red;
11const yellow = colors.yellow;
12
13
/**
 * Derive a short, Cassandra-identifier-safe digest from a key.
 *
 * The key is hashed with SHA-1 and base64-encoded; the base64 characters
 * `+` and `/` are mapped to `_`, and the trailing `=` padding is dropped.
 * A SHA-1 digest is 20 bytes, so the result is always 27 characters from
 * the set [A-Za-z0-9_].
 *
 * @param {string|Buffer} key - Value to hash.
 * @returns {string} The sanitised base64 digest.
 */
function hashKey(key) {
    // crypto.createHash() is the supported factory; calling the Hash
    // constructor directly is deprecated in Node.js.
    return crypto.createHash('sha1')
        .update(key)
        .digest('base64')
        // Replace [+/] from base64 with _ (illegal in Cassandra)
        .replace(/[+/]/g, '_')
        // Remove base64 padding, has no entropy
        .replace(/=+$/, '');
}
24
25
/**
 * Return the longest leading run of `key` made up only of characters
 * matching [a-zA-Z0-9_].
 *
 * @param {string} key - Candidate identifier.
 * @returns {string} The leading valid run, or '' when the first character
 *     is not in that set (or the key is empty).
 */
function getValidPrefix(key) {
    const prefixMatch = /^[a-zA-Z0-9_]+/.exec(key);
    return prefixMatch ? prefixMatch[0] : '';
}
34
35
/**
 * Turn an arbitrary key into an identifier of at most `length` characters
 * drawn from [a-zA-Z0-9_].
 *
 * The key is first escaped: every `_` is doubled and every `.` becomes `_`.
 * - If the escaped key still contains other characters, the result is its
 *   valid leading run (capped at two thirds of `length`) followed by as
 *   much of the hash of the ORIGINAL key as fits in the remaining budget.
 * - If the escaped key is valid but too long, the result is its first two
 *   thirds of `length` followed by one third of `length` worth of hash.
 * - Otherwise the escaped key is returned unchanged.
 *
 * @param {string} key - Raw name (for example a domain or table name).
 * @param {number} length - Maximum length of the returned identifier.
 * @returns {string} The sanitised identifier.
 */
function makeValidKey(key, length) {
    // Take the first `count` characters; fractional counts are truncated
    // and negative counts yield '' (the behaviour the budgets below rely on).
    const take = (str, count) => str.slice(0, Math.max(0, Math.trunc(count)));

    const escaped = key.replace(/_/g, '__')
        .replace(/\./g, '_');

    if (!/^[a-zA-Z0-9_]+$/.test(escaped)) {
        // Keep whatever readable prefix is valid, then pad with hash so
        // that distinct invalid keys stay distinct.
        const validPrefix = take(getValidPrefix(escaped), length * 2 / 3);
        return validPrefix + take(hashKey(key), length - validPrefix.length);
    }
    if (escaped.length > length) {
        return take(escaped, length * 2 / 3) + take(hashKey(key), length / 3);
    }
    return escaped;
}
50
51
/**
 * Build the keyspace name for a storage group / table pair.
 *
 * The result has the form `<prefix>_T_<table>`, where `<prefix>` is the
 * lower-cased, dot-reversed `name` (e.g. `en.wikipedia.org` becomes
 * `org.wikipedia.en`) and both parts are passed through makeValidKey()
 * so that the whole name fits in 48 characters.
 *
 * @param {string} name - Storage group or domain name.
 * @param {string} table - Logical table name.
 * @returns {string} The (unquoted) keyspace name.
 */
function keyspaceName(name, table) {
    // NOTE(review): 48 appears to be Cassandra's keyspace-name length
    // limit; confirm against the target cluster version.
    const maxLength = 48;
    // Separator between the domain part and the table part.
    const separator = '_T_';

    const reversedName = name.toLowerCase().split('.').reverse().join('.');
    // The prefix always gets at least 26 characters, more when the table
    // name is short enough to leave room.
    const prefix = makeValidKey(
        reversedName,
        Math.max(26, maxLength - table.length - separator.length)
    );
    // The table part gets whatever the prefix and separator left over.
    const tablePart = makeValidKey(table, maxLength - prefix.length - separator.length);
    return `${prefix}${separator}${tablePart}`;
}
59
60
/**
 * Quote a name as a CQL identifier.
 *
 * The name is wrapped in double quotes and any embedded double quote is
 * doubled. Names without quotes come back simply wrapped, so no separate
 * "already safe" branch is needed.
 *
 * @param {string} name - Unquoted identifier.
 * @returns {string} The double-quoted identifier.
 */
function cassID(name) {
    return `"${name.replace(/"/g, '""')}"`;
}
68
// Logical table name -> CQL type substituted for the `<type>` placeholder
// of the data-table template.
const tables = {
    'term.definition-ng': 'blob'
};

// Storage groups to generate keyspaces for. Only 'others' is enabled; the
// commented-out entries are left in place as disabled alternatives.
const storages = [
    // 'enwiki',
    // 'commons',
    // 'wikipedia',
    'others'
];
79
// CQL template creating the keyspace; `<keyspace>` is substituted with the
// quoted keyspace name before the statement is printed.
const qKs = `CREATE KEYSPACE IF NOT EXISTS <keyspace> WITH replication = {'class': 'NetworkTopologyStrategy', 'codfw': '3', 'eqiad': '3'} AND durable_writes = true;`;

// CQL template creating the `meta` table (text key -> text value) inside
// the keyspace substituted for `<keyspace>`.
const qMeta = `CREATE TABLE IF NOT EXISTS <keyspace>.meta (
    key text PRIMARY KEY,
    value text
) WITH bloom_filter_fp_chance = 0.1
    AND caching = {'keys': 'ALL', 'rows_per_partition': 'NONE'}
    AND comment = ''
    AND compaction = {'class': 'org.apache.cassandra.db.compaction.LeveledCompactionStrategy'}
    AND compression = {'chunk_length_in_kb': '64', 'class': 'org.apache.cassandra.io.compress.LZ4Compressor'}
    AND crc_check_chance = 1.0
    AND dclocal_read_repair_chance = 0.1
    AND default_time_to_live = 0
    AND gc_grace_seconds = 864000
    AND max_index_interval = 2048
    AND memtable_flush_period_in_ms = 0
    AND min_index_interval = 128
    AND read_repair_chance = 0.0
    AND speculative_retry = '99PERCENTILE';`;
99
// CQL template for a revisioned `data` table, clustered by (rev, tid) in
// descending order. `<keyspace>` and `<type>` are placeholders.
// NOTE(review): not referenced by the generation loop in this script.
const qData = `CREATE TABLE IF NOT EXISTS <keyspace>.data (
    "_domain" text,
    key text,
    rev int,
    tid timeuuid,
    "content-location" text,
    "content-type" text,
    tags set<text>,
    value <type>,
    PRIMARY KEY (("_domain", key), rev, tid)
) WITH CLUSTERING ORDER BY (rev DESC, tid DESC)
    AND bloom_filter_fp_chance = 0.01
    AND caching = {'keys': 'ALL', 'rows_per_partition': 'NONE'}
    AND comment = ''
    AND compaction = {'class': 'org.apache.cassandra.db.compaction.SizeTieredCompactionStrategy', 'max_threshold': '32', 'min_threshold': '4'}
    AND compression = {'chunk_length_in_kb': '256', 'class': 'org.apache.cassandra.io.compress.DeflateCompressor'}
    AND crc_check_chance = 1.0
    AND dclocal_read_repair_chance = 0.1
    AND default_time_to_live = 0
    AND gc_grace_seconds = 86400
    AND max_index_interval = 2048
    AND memtable_flush_period_in_ms = 0
    AND min_index_interval = 128
    AND read_repair_chance = 0.0
    AND speculative_retry = '99PERCENTILE';`;
125
// CQL template for a `data` table mapping (domain, key, ts) to a revision,
// newest timestamp first, with a 864000-second default TTL.
// NOTE(review): not referenced by the generation loop in this script.
const qDataRevision = `CREATE TABLE IF NOT EXISTS <keyspace>.data (
    "_domain" text,
    key text,
    ts timestamp,
    rev int,
    PRIMARY KEY (("_domain", key), ts)
) WITH CLUSTERING ORDER BY (ts DESC)
    AND bloom_filter_fp_chance = 0.1
    AND caching = {'keys': 'ALL', 'rows_per_partition': 'NONE'}
    AND comment = ''
    AND compaction = {'class': 'org.apache.cassandra.db.compaction.LeveledCompactionStrategy'}
    AND compression = {'chunk_length_in_kb': '64', 'class': 'org.apache.cassandra.io.compress.LZ4Compressor'}
    AND crc_check_chance = 1.0
    AND dclocal_read_repair_chance = 0.1
    AND default_time_to_live = 864000
    AND gc_grace_seconds = 864000
    AND max_index_interval = 2048
    AND memtable_flush_period_in_ms = 0
    AND min_index_interval = 128
    AND read_repair_chance = 0.0
    AND speculative_retry = '99PERCENTILE';`;
147
// CQL template for a `data` table keyed by (domain, key) and clustered by
// (rev, ts) in descending order, with a 864000-second default TTL.
// NOTE(review): not referenced by the generation loop in this script.
const qDataRender = `CREATE TABLE IF NOT EXISTS <keyspace>.data (
    "_domain" text,
    key text,
    rev int,
    ts timestamp,
    tid timeuuid,
    PRIMARY KEY (("_domain", key), rev, ts)
) WITH CLUSTERING ORDER BY (rev DESC, ts DESC)
    AND bloom_filter_fp_chance = 0.1
    AND caching = {'keys': 'ALL', 'rows_per_partition': 'NONE'}
    AND comment = ''
    AND compaction = {'class': 'org.apache.cassandra.db.compaction.LeveledCompactionStrategy'}
    AND compression = {'chunk_length_in_kb': '64', 'class': 'org.apache.cassandra.io.compress.LZ4Compressor'}
    AND crc_check_chance = 1.0
    AND dclocal_read_repair_chance = 0.1
    AND default_time_to_live = 864000
    AND gc_grace_seconds = 864000
    AND max_index_interval = 2048
    AND memtable_flush_period_in_ms = 0
    AND min_index_interval = 128
    AND read_repair_chance = 0.0
    AND speculative_retry = '99PERCENTILE';`;
170
// CQL template for the key-value `data` table: one row per (domain, key)
// holding a tid, serialized headers and the value. `<keyspace>` and
// `<type>` are substituted before the statement is printed.
const qKvData = `CREATE TABLE IF NOT EXISTS <keyspace>.data (
    "_domain" text,
    key text,
    tid timeuuid,
    headers blob,
    value <type>,
    PRIMARY KEY (("_domain", key))
) WITH bloom_filter_fp_chance = 0.01
    AND caching = {'keys': 'ALL', 'rows_per_partition': 'NONE'}
    AND comment = ''
    AND compaction = {'class': 'org.apache.cassandra.db.compaction.SizeTieredCompactionStrategy', 'max_threshold': '32', 'min_threshold': '4'}
    AND compression = {'chunk_length_in_kb': '64', 'class': 'org.apache.cassandra.io.compress.LZ4Compressor'}
    AND crc_check_chance = 1.0
    AND dclocal_read_repair_chance = 0.1
    AND default_time_to_live = 0
    AND gc_grace_seconds = 86400
    AND max_index_interval = 2048
    AND memtable_flush_period_in_ms = 0
    AND min_index_interval = 128
    AND read_repair_chance = 0.0
    AND speculative_retry = '99PERCENTILE';`;
192
193
// Command-line handling: the only option declared is -h/--help, which
// prints the usage text and exits successfully.
const argv = yargs.usage('Usage: $0 [options]')
    .options('h', { alias: 'help' })
    .argv;

if (argv.help) {
    yargs.showHelp();
    process.exit(0);
}
202
203
/**
 * Print one YAML entry of the form
 *
 *     <label>:
 *         statement: |
 *             <CQL statement, one line per template line>
 *
 * @param {string} label - Top-level YAML key for the entry.
 * @param {string} query - CQL template containing `<keyspace>` and,
 *     optionally, `<type>`.
 * @param {string} keyspace - Quoted keyspace identifier to substitute.
 * @param {string} [type] - CQL type substituted for `<type>` when given.
 */
function printStatement(label, query, keyspace, type) {
    // Function replacers keep `$` sequences in the substituted text literal.
    let statement = query.replace('<keyspace>', () => keyspace);
    if (type) {
        statement = statement.replace('<type>', () => type);
    }

    console.log(`${label}:`);
    console.log('    statement: |');
    // Block-scalar content must be indented deeper than its `statement` key.
    statement.split('\n').forEach((line) => {
        console.log(`        ${line}`);
    });
}

// Emit the keyspace, meta-table and data-table statements for every
// storage group / table combination. Everything here is synchronous, so
// plain loops are enough.
for (const storage of storages) {
    for (const table of Object.keys(tables)) {
        const keyspace = cassID(keyspaceName(storage, table));
        // YAML keys use the bare keyspace name, without CQL quoting.
        const label = keyspace.replace(/"/g, '');

        printStatement(label, qKs, keyspace);
        printStatement(`${label}_meta`, qMeta, keyspace);
        printStatement(`${label}_data`, qKvData, keyspace, tables[table]);
        console.log('\n# -----\n');
    }
}

The output looks like the following.

others_T_term_definitioniTACmxtqcTMuXLM4yrvjm4H9:
    statement: |
        CREATE KEYSPACE IF NOT EXISTS "others_T_term_definitioniTACmxtqcTMuXLM4yrvjm4H9" WITH replication = {'class': 'NetworkTopologyStrategy', 'codfw': '3', 'eqiad': '3'} AND durable_writes = true;
others_T_term_definitioniTACmxtqcTMuXLM4yrvjm4H9_meta:
    statement: |
        CREATE TABLE IF NOT EXISTS "others_T_term_definitioniTACmxtqcTMuXLM4yrvjm4H9".meta (
            key text PRIMARY KEY,
            value text
        ) WITH bloom_filter_fp_chance = 0.1
            AND caching = {'keys': 'ALL', 'rows_per_partition': 'NONE'}
            AND comment = ''
            AND compaction = {'class': 'org.apache.cassandra.db.compaction.LeveledCompactionStrategy'}
            AND compression = {'chunk_length_in_kb': '64', 'class': 'org.apache.cassandra.io.compress.LZ4Compressor'}
            AND crc_check_chance = 1.0
            AND dclocal_read_repair_chance = 0.1
            AND default_time_to_live = 0
            AND gc_grace_seconds = 864000
            AND max_index_interval = 2048
            AND memtable_flush_period_in_ms = 0
            AND min_index_interval = 128
            AND read_repair_chance = 0.0
            AND speculative_retry = '99PERCENTILE';
others_T_term_definitioniTACmxtqcTMuXLM4yrvjm4H9_data:
    statement: |
        CREATE TABLE IF NOT EXISTS "others_T_term_definitioniTACmxtqcTMuXLM4yrvjm4H9".data (
            "_domain" text,
            key text,
            tid timeuuid,
            headers blob,
            value blob,
            PRIMARY KEY (("_domain", key))
        ) WITH bloom_filter_fp_chance = 0.01
            AND caching = {'keys': 'ALL', 'rows_per_partition': 'NONE'}
            AND comment = ''
            AND compaction = {'class': 'org.apache.cassandra.db.compaction.SizeTieredCompactionStrategy', 'max_threshold': '32', 'min_threshold': '4'}
            AND compression = {'chunk_length_in_kb': '64', 'class': 'org.apache.cassandra.io.compress.LZ4Compressor'}
            AND crc_check_chance = 1.0
            AND dclocal_read_repair_chance = 0.1
            AND default_time_to_live = 0
            AND gc_grace_seconds = 86400
            AND max_index_interval = 2048
            AND memtable_flush_period_in_ms = 0
            AND min_index_interval = 128
            AND read_repair_chance = 0.0
            AND speculative_retry = '99PERCENTILE';

# -----


Which we can apply using mkschema.

More eyes before applying these changes would be much appreciated!

Mentioned in SAL (#wikimedia-operations) [2017-11-07T11:42:46Z] <mobrovac> restbase truncating cassandra 2 non-WP tables for T179420

Mentioned in SAL (#wikimedia-operations) [2017-11-07T11:42:46Z] <mobrovac> restbase truncating cassandra 2 non-WP tables for T179420

Copy/paste fail ... Not related to this task at all.

Mentioned in SAL (#wikimedia-operations) [2017-11-07T14:38:43Z] <mobrovac> restbase creating wiktionary definition schemas for T179420

Thank you @Eevans for the schemas. I created them with one change: the value column needs to be of type text rather than blob, because we are storing the stringified JSON object returned by MCS.

I guess we can go with https://github.com/wikimedia/restbase/pull/896 right away without creating a proxy. The endpoint is quite low-volume, so it's ok if we just recreate everything.

Mentioned in SAL (#wikimedia-operations) [2017-11-07T15:02:39Z] <mobrovac@tin> Started deploy [restbase/deploy@c5dd1e2]: Switch wiktionary definitions to use the next-gen storage - T179420

Mentioned in SAL (#wikimedia-operations) [2017-11-07T15:10:31Z] <mobrovac@tin> Finished deploy [restbase/deploy@c5dd1e2]: Switch wiktionary definitions to use the next-gen storage - T179420 (duration: 07m 52s)

Mentioned in SAL (#wikimedia-operations) [2017-11-07T15:22:54Z] <mobrovac@tin> Started deploy [restbase/deploy@eab2948]: revert definition switch, wrong schema - T179420

Mentioned in SAL (#wikimedia-operations) [2017-11-07T15:29:40Z] <mobrovac@tin> Finished deploy [restbase/deploy@eab2948]: revert definition switch, wrong schema - T179420 (duration: 06m 46s)

Mentioned in SAL (#wikimedia-operations) [2017-11-07T15:37:16Z] <urandom> T179420: recreating wiktionary definition schemas

Mentioned in SAL (#wikimedia-operations) [2017-11-08T16:02:30Z] <mobrovac@tin> Started deploy [restbase/deploy@c5dd1e2]: Switch wiktionary definitions to use the next-gen storage, take 2 - T179420

Mentioned in SAL (#wikimedia-operations) [2017-11-08T16:02:43Z] <mobrovac@tin> Finished deploy [restbase/deploy@c5dd1e2]: Switch wiktionary definitions to use the next-gen storage, take 2 - T179420 (duration: 00m 13s)

Mentioned in SAL (#wikimedia-operations) [2017-11-08T16:04:13Z] <mobrovac@tin> Started deploy [restbase/deploy@c5dd1e2]: Switch wiktionary definitions to use the next-gen storage, take 2b - T179420

Mentioned in SAL (#wikimedia-operations) [2017-11-08T16:11:34Z] <mobrovac@tin> Finished deploy [restbase/deploy@c5dd1e2]: Switch wiktionary definitions to use the next-gen storage, take 2b - T179420 (duration: 07m 22s)

mobrovac closed this task as Resolved.Nov 8 2017, 4:12 PM
mobrovac claimed this task.
mobrovac edited projects, added Services (done); removed Services (doing).

RESTBase is now using Cassandra 3 for definitions. Resolving.