To build an incremental version of the mediawiki-history dataset, the analytics-engineering team needs two new event types to flow onto HDFS: user_create and user_rename.
Here is a draft of the schemas that would work:
# Draft JSONSchema (draft-07) for the event describing the creation of a
# MediaWiki user account.
title: mediawiki/user/create
description: Represents a MW User Create event
$id: /mediawiki/user/create/1.0.0
$schema: 'https://json-schema.org/draft-07/schema#'
type: object
# Every name listed here must match a key under `properties` below.
required:
  - $schema
  - database
  - meta
  - user_id
  - user_text
  - user_groups
  - user_is_bot
  - user_creation_mode
properties:
  $schema:
    type: string
    description: >
      A URI identifying the JSONSchema for this event. This should match an
      schema's $id in a schema repository. E.g. /schema_name/1.0.0

  # Event metadata; only id, dt and stream are mandatory.
  meta:
    type: object
    required:
      - id
      - dt
      - stream
    properties:
      uri:
        type: string
        format: uri-reference
        maxLength: 8192
        description: Unique URI identifying the event or entity
      request_id:
        type: string
        description: Unique ID of the request that caused the event
      id:
        type: string
        # Hexadecimal groups of 8-4-4-4-12 characters separated by dashes.
        pattern: '^[a-fA-F0-9]{8}(-[a-fA-F0-9]{4}){3}-[a-fA-F0-9]{12}$'
        maxLength: 36
        description: Unique ID of this event
      dt:
        type: string
        format: date-time
        maxLength: 128
        description: 'Event datetime, in ISO-8601 format'
      domain:
        type: string
        description: Domain the event or entity pertains to
        minLength: 1
      stream:
        type: string
        description: Name of the stream/queue/dataset that this event belongs in
        minLength: 1

  database:
    description: The name of the wiki database this event entity belongs to.
    type: string
    minLength: 1

  # The acting user. The object itself is not required at the top level, but
  # when present it must carry user_text, user_groups and user_is_bot.
  performer:
    description: Represents the user that performed this change.
    type: object
    required:
      - user_text
      - user_groups
      - user_is_bot
    properties:
      user_id:
        description: >
          The user id that performed this change. This is optional, and will
          not be present for anonymous users.
        type: integer
      user_text:
        description: The text representation of the user that performed this change.
        type: string
        minLength: 1
      user_groups:
        description: 'A list of the groups this user belongs to. E.g. bot, sysop etc.'
        type: array
        items:
          type: string
          minLength: 1
      user_is_bot:
        description: >
          True if this user is considered to be a bot at the time of this event.
          This is checked via the $user->isBot() method, which considers both
          user_groups and user permissions.
        type: boolean
      user_registration_dt:
        description: >
          The datetime of the user account registration. Not present for
          anonymous users or if missing in the MW database.
        type: string
        format: date-time
        maxLength: 128
      user_edit_count:
        description: >
          The number of edits this user has made at the time of this event. Not
          present for anonymous users.
        type: integer
        minimum: 0

  comment:
    description: The comment left by the user that performed this change.
    type: string

  # Fields below describe the created account (as opposed to the performer).
  user_id:
    description: >
      The user id of the created user.
    type: integer
  user_text:
    description: The text representation of the created user.
    type: string
    minLength: 1
  user_groups:
    description: |
      A list of the groups the created user belongs to. E.g. bot, sysop etc.
    type: array
    items:
      type: string
      minLength: 1
  user_is_bot:
    description: >
      True if this user is considered to be a bot at the time of this event.
      This is checked via the $user->isBot() method, which considers both
      user_groups and user permissions.
    type: boolean
  # Any non-empty string is accepted; the values named in the description are
  # examples only, no enum is enforced.
  user_creation_mode:
    description: |
      The description of how the user has been created. E.g. 'self-created', 'peer-created', 'system-created'
    type: string
    minLength: 1

and:
# Draft JSONSchema (draft-07) for the event describing the renaming of a
# MediaWiki user account.
title: mediawiki/user/rename
description: Represents a MW User Rename event
# The $id mirrors the title so that it is distinct from other schemas' ids.
$id: /mediawiki/user/rename/1.0.0
$schema: 'https://json-schema.org/draft-07/schema#'
type: object
# Every name listed here must match a key under `properties` below.
required:
  - $schema
  - database
  - meta
  - user_id
  - user_text
  - user_groups
  - user_is_bot
  - prior_state
properties:
  $schema:
    type: string
    description: >
      A URI identifying the JSONSchema for this event. This should match an
      schema's $id in a schema repository. E.g. /schema_name/1.0.0

  # Event metadata; only id, dt and stream are mandatory.
  meta:
    type: object
    required:
      - id
      - dt
      - stream
    properties:
      uri:
        type: string
        format: uri-reference
        maxLength: 8192
        description: Unique URI identifying the event or entity
      request_id:
        type: string
        description: Unique ID of the request that caused the event
      id:
        type: string
        # Hexadecimal groups of 8-4-4-4-12 characters separated by dashes.
        pattern: '^[a-fA-F0-9]{8}(-[a-fA-F0-9]{4}){3}-[a-fA-F0-9]{12}$'
        maxLength: 36
        description: Unique ID of this event
      dt:
        type: string
        format: date-time
        maxLength: 128
        description: 'Event datetime, in ISO-8601 format'
      domain:
        type: string
        description: Domain the event or entity pertains to
        minLength: 1
      stream:
        type: string
        description: Name of the stream/queue/dataset that this event belongs in
        minLength: 1

  database:
    description: The name of the wiki database this event entity belongs to.
    type: string
    minLength: 1

  # The acting user. The object itself is not required at the top level, but
  # when present it must carry user_text, user_groups and user_is_bot.
  performer:
    description: Represents the user that performed this change.
    type: object
    required:
      - user_text
      - user_groups
      - user_is_bot
    properties:
      user_id:
        description: >
          The user id that performed this change. This is optional, and will
          not be present for anonymous users.
        type: integer
      user_text:
        description: The text representation of the user that performed this change.
        type: string
        minLength: 1
      user_groups:
        description: 'A list of the groups this user belongs to. E.g. bot, sysop etc.'
        type: array
        items:
          type: string
          minLength: 1
      user_is_bot:
        description: >
          True if this user is considered to be a bot at the time of this event.
          This is checked via the $user->isBot() method, which considers both
          user_groups and user permissions.
        type: boolean
      user_registration_dt:
        description: >
          The datetime of the user account registration. Not present for
          anonymous users or if missing in the MW database.
        type: string
        format: date-time
        maxLength: 128
      user_edit_count:
        description: >
          The number of edits this user has made at the time of this event. Not
          present for anonymous users.
        type: integer
        minimum: 0

  comment:
    description: The comment left by the user that performed this change.
    type: string

  # Fields below describe the renamed account (as opposed to the performer).
  user_id:
    description: >
      The user id of the renamed user.
    type: integer
  user_text:
    description: The text representation of the renamed user.
    type: string
    minLength: 1
  user_groups:
    description: |
      A list of the groups the renamed user belongs to. E.g. bot, sysop etc.
    type: array
    items:
      type: string
      minLength: 1
  user_is_bot:
    description: >
      True if this user is considered to be a bot at the time of this event.
      This is checked via the $user->isBot() method, which considers both
      user_groups and user permissions.
    type: boolean

  # Snapshot of the values that changed; for a rename the previous user_text
  # is mandatory.
  prior_state:
    description: >
      The prior state of the entity before this event. If a top level entity
      field is not present in this object, then its value has not changed since
      the prior event.
    type: object
    required:
      - user_text
    properties:
      user_text:
        description: The text representation of the user before this event.
        type: string
        minLength: 1