#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
A tool to transfer flickr photos to Wikimedia Commons.

The following parameters are supported:

-group_id:         specify group ID of the pool
-photoset_id:      specify a photoset id
-user_id:          give the user id of the flickr user
-start_id:         the photo id to start with
-end_id:           the photo id to end with
-tags:             a tag to filter photo items (only one is supported)
-flickrreview      add a flickr review template to the description
-reviewer:         specify the reviewer
-override:         override text for licence
-addcategory:      specify a category
-removecategories  remove all categories
-autonomous        run bot in autonomous mode
"""
#
# (C) Multichill, 2009
# (C) Pywikibot team, 2009-2020
#
# Distributed under the terms of the MIT license.
#
from __future__ import absolute_import, division, unicode_literals

import base64
import hashlib
import io
import re

import pywikibot
from pywikibot import config, textlib
from pywikibot.comms.http import fetch
from pywikibot.specialbots import UploadRobot
from pywikibot.tools import PY2

try:
    from pywikibot.userinterfaces.gui import Tkdialog
except ImportError as _tk_error:
    # Keep the exception; processPhoto() checks for it before using the GUI
    Tkdialog = _tk_error

if not PY2:
    from urllib.parse import urlencode
else:
    from urllib import urlencode

try:
    import flickrapi  # see: http://stuvel.eu/projects/flickrapi
except ImportError as e:
    # Keep the exception; main() reports the missing dependency
    flickrapi = e

# see https://www.flickr.com/services/api/flickr.photos.licenses.getInfo.html
flickr_allowed_license = {
    0: False,  # All Rights Reserved
    1: False,  # Creative Commons Attribution-NonCommercial-ShareAlike License
    2: False,  # Creative Commons Attribution-NonCommercial License
    3: False,  # Creative Commons Attribution-NonCommercial-NoDerivs License
    4: True,   # Creative Commons Attribution License
    5: True,   # Creative Commons Attribution-ShareAlike License
    6: False,  # Creative Commons Attribution-NoDerivs License
    7: True,   # No known copyright restrictions
    8: True,   # United States Government Work
    9: True,   # Public Domain Dedication (CC0)
    10: True,  # Public Domain Mark
}

# Byte budget (UTF-8) used for generated titles and file names
_MAX_FILENAME_BYTES = 240


def getPhoto(flickr, photo_id):
    """
    Get the photo info and the photo sizes so we can use these later on.

    The two API calls are retried, with a 30 second pause, for as long as
    they raise a FlickrError.

    @param flickr: Flickr API client
    @param photo_id: id of the photo to look up
    @return: tuple (photoInfo, photoSizes)
    """
    while True:
        try:
            photoInfo = flickr.photos_getInfo(photo_id=photo_id)
            photoSizes = flickr.photos_getSizes(photo_id=photo_id)
        except flickrapi.exceptions.FlickrError:
            pywikibot.output('Flickr api problem, sleeping')
            pywikibot.sleep(30)
        else:
            return photoInfo, photoSizes


def isAllowedLicense(photoInfo):
    """
    Check if the image contains the right license.

    License ids that are not listed in flickr_allowed_license are treated
    as not allowed.

    TODO: Maybe add more licenses

    @param photoInfo: photo info as returned by getPhoto
    @rtype: bool
    """
    license_id = int(photoInfo.find('photo').attrib['license'])
    return flickr_allowed_license.get(license_id, False)


def getPhotoUrl(photoSizes):
    """Get the url of the jpg file with the highest resolution."""
    url = ''
    # The assumption is that the largest image is last
    for size in photoSizes.find('sizes').findall('size'):
        url = size.attrib['source']
    return url


def downloadPhoto(photoUrl):
    """
    Download the photo and store it in a io.BytesIO object.

    TODO: Add exception handling
    """
    imageFile = fetch(photoUrl).raw
    return io.BytesIO(imageFile)


def findDuplicateImages(photo, site=None):
    """Find duplicate images.

    Take the photo, calculate the SHA1 hash and ask the MediaWiki api
    for a list of duplicates.

    TODO: Add exception handling.

    @param photo: Photo
    @type photo: io.BytesIO
    @param site: Site to search for duplicates.
        Defaults to using Wikimedia Commons if not supplied.
    @type site: pywikibot.site.APISite or None
    """
    if not site:
        site = pywikibot.Site('commons', 'commons')
    hashObject = hashlib.sha1()
    hashObject.update(photo.getvalue())
    return site.getFilesFromAnHash(base64.b16encode(hashObject.digest()))


def getTags(photoInfo):
    """Get all the tags on a photo, lower-cased."""
    # see https://www.flickr.com/services/api/misc.tags.html ,
    # use original tag name
    return [tag.attrib['raw'].lower()
            for tag in photoInfo.find('photo').find('tags').findall('tag')]


def getFlinfoDescription(photo_id):
    """
    Get the description from http://wikipedia.ramselehof.de/flinfo.php.

    TODO: Add exception handling, try a couple of times
    """
    parameters = urlencode({'id': photo_id, 'raw': 'on'})

    return fetch(
        'http://wikipedia.ramselehof.de/flinfo.php?%s' % parameters).text


def _truncateToBytes(text, maxBytes):
    """Cut text so that its UTF-8 encoding is at most maxBytes long.

    A non-positive maxBytes gives an empty string.
    """
    encoded = text.encode('utf-8')
    if len(encoded) <= maxBytes:
        return text
    # 'ignore' drops a multi-byte character split by the byte slice
    return encoded[:max(maxBytes, 0)].decode('utf-8', 'ignore')


def maxTitle(title, project='Flickr', username=''):
    """Find the max length for a mw title.

    @param title: title to shorten
    @param project: project name that will be part of the file name
    @param username: user name that will be part of the file name
    @return: title, cut to the bytes left once project and username are
        subtracted from the 240 byte budget
    """
    maxBytes = (_MAX_FILENAME_BYTES - len(project.encode('utf-8'))
                - len(username.encode('utf-8')))
    return _truncateToBytes(title, maxBytes)


def _buildFilename(title, project, username, fileformat, index=None):
    """Assemble a file name, trimming only the title to fit the budget.

    With an index the name gets a ' (index)' part before the extension.
    """
    if index is None:
        tail = ' - {} - {}.{}'.format(project, username, fileformat)
    else:
        tail = ' - {} - {} ({}).{}'.format(project, username, index,
                                           fileformat)
    # Separators, counter and extension are kept; the title gives way
    budget = _MAX_FILENAME_BYTES - len(tail.encode('utf-8'))
    return _truncateToBytes(title, budget) + tail


def getFilename(photoInfo, site=None, project='Flickr', photo_url=None):
    """Build a good filename for the upload based on the username and title.

    Prevents naming collisions.

    The title is taken from the photo title, else from its description,
    else from the photo id.

    @param photoInfo: photo info as returned by getPhoto
    @param site: site checked for existing files.
        Defaults to using Wikimedia Commons if not supplied.
    @param project: project name put into the file name
    @param photo_url: url used to guess the file extension when the photo
        info has no original format
    @return: file name without the 'File:' prefix
    """
    if not site:
        site = pywikibot.Site('commons', 'commons')
    photo = photoInfo.find('photo')
    username = cleanUpTitle(photo.find('owner').attrib['username'])

    title = photo.find('title').text
    if title:
        title = maxTitle(cleanUpTitle(title),
                         project=project, username=username)
    if not title:
        description = photo.find('description').text
        if description:
            title = maxTitle(cleanUpTitle(description),
                             project=project, username=username)
    if not title:
        # The id of the photo is the last resort
        title = photo.attrib['id']

    fileformat = photo.attrib.get('originalformat', '')
    if not fileformat and photo_url:
        # TODO: in autonomous mode, If not fileformat and not photo_url,
        # it will pause and wait for the user to enter the file format.
        fileformat = photo_url.split('.')[-1]

    # Check the name that will really be returned, i.e. after trimming
    name = _buildFilename(title, project, username, fileformat)
    i = 0
    while pywikibot.Page(site, 'File:' + name).exists():
        i += 1
        name = _buildFilename(title, project, username, fileformat, i)
    return name


def cleanUpTitle(title):
    """Clean up the title of a potential MediaWiki page.

    Otherwise the title of the page might not be allowed by the software.

    The substitutions below are applied in order; later ones rely on the
    result of earlier ones.
    """
    title = title.strip()
    title = re.sub(r'[<{\[]', '(', title)
    title = re.sub(r'[>}\]]', ')', title)
    title = re.sub(r'[ _]?\(!\)', '', title)
    title = re.sub(',:[ _]', ', ', title)
    title = re.sub('[;:][ _]', ', ', title)
    title = re.sub(r'[\t\n ]+', ' ', title)
    title = re.sub(r'[\r\n ]+', ' ', title)
    title = re.sub('[\n]+', '', title)
    title = re.sub('[?!]([.\"]|$)', r'\1', title)
    title = re.sub('[&#%?!]', '^', title)
    title = re.sub('[;]', ',', title)
    title = re.sub(r'[/+\\:]', '-', title)
    title = re.sub('--+', '-', title)
    title = re.sub(',,+', ',', title)
    title = re.sub('[-,^]([.]|$)', r'\1', title)
    title = title.replace(' ', '_')
    title = title.replace('|', ',')
    title = title.strip('_')
    return title


def buildDescription(flinfoDescription='', flickrreview=False, reviewer='',
                     override='', addCategory='', removeCategories=False):
    """Build the final description for the image.

    The description is based on the info from flickrinfo and improved.

    @param flinfoDescription: description text fetched from flinfo
    @param flickrreview: whether to fill in the flickrreview template
    @param reviewer: reviewer name put into the flickrreview template
    @param override: license text that replaces the license templates
    @param addCategory: category to append
    @param removeCategories: whether to strip existing category links
    @return: the improved description
    """
    # http://wikipedia.ramselehof.de/flinfo.php output format update
    description = flinfoDescription
    # use template {{Taken on}}; the text may have no |Date= line at all
    dateMatch = re.search(r'\|Date=(.*)\n', description)
    if dateMatch and dateMatch.group(1):
        dateLine = '|Date={{Taken on|%s}}\n' % dateMatch.group(1)
        # A callable replacement is inserted literally, so backslashes in
        # the date are not read as regex escapes
        description = re.sub(r'\|Date=.*\n', lambda _match: dateLine,
                             description)

    if removeCategories:
        description = textlib.removeCategoryLinks(
            description, pywikibot.Site('commons', 'commons'))

    if override:
        description = description.replace('{{cc-by-sa-2.0}}\n', '')
        description = description.replace('{{cc-by-2.0}}\n', '')
        description = description.replace('{{flickrreview}}\n', '')
        description = description.replace(
            '{{copyvio|Flickr, licensed as "All Rights Reserved" which is not '
            'a free license --~~~~}}\n',
            '')
        description = description.replace('=={{int:license}}==',
                                          '=={{int:license}}==\n' + override)
    elif flickrreview:
        if reviewer:
            description = description.replace(
                '{{flickrreview}}',
                '{{flickrreview|%s|'
                '{{subst:CURRENTYEAR}}-{{subst:CURRENTMONTH}}-'
                '{{subst:CURRENTDAY2}}}}' % reviewer)

    if addCategory:
        description = description.replace('{{subst:unc}}\n', '')
        description = description + '\n[[Category:' + addCategory + ']]\n'
    elif '{{subst:unc}}' not in description:
        # Request category check
        description = description + '{{subst:chc}}\n'

    description = description.replace('\r\n', '\n')
    return description


def processPhoto(flickr, photo_id='', flickrreview=False, reviewer='',
                 override='', addCategory='', removeCategories=False,
                 autonomous=False):
    """Process a single Flickr photo.

    For each image:

      * Check the license
      * Check if it isn't already on Commons
      * Build suggested filename
        * Check for name collision and maybe alter it
      * Pull description from Flinfo
      * Show image and description to user
        * Add a nice hotcat lookalike for the adding of categories
        * Filter the categories
      * Upload the image

    @return: 1 when an upload was started, otherwise 0
    @rtype: int
    """
    if not photo_id:
        # Nothing to look up
        return 0

    pywikibot.output(str(photo_id))
    (photoInfo, photoSizes) = getPhoto(flickr, photo_id)

    if not (isAllowedLicense(photoInfo) or override):
        pywikibot.output('Invalid license')
        return 0

    # Get the url of the largest photo
    photoUrl = getPhotoUrl(photoSizes)
    # Should download the photo only once
    photo = downloadPhoto(photoUrl)

    # Don't upload duplicate images, should add override option
    duplicates = findDuplicateImages(photo)
    if duplicates:
        pywikibot.output('Found duplicate image at {}'
                         .format(duplicates.pop()))
        return 0

    filename = getFilename(photoInfo, photo_url=photoUrl)
    flinfoDescription = getFlinfoDescription(photo_id)
    photoDescription = buildDescription(flinfoDescription,
                                        flickrreview, reviewer,
                                        override, addCategory,
                                        removeCategories)
    if not isinstance(Tkdialog, ImportError) and not autonomous:
        try:
            (newPhotoDescription, newFilename, skip) = Tkdialog(
                photoDescription, photo, filename).show_dialog()
        except ImportError as e:
            pywikibot.warning(e)
            pywikibot.warning('Switching to autonomous mode.')
            autonomous = True
    elif not autonomous:
        pywikibot.warning('Switching to autonomous mode because GUI '
                          'interface cannot be used')
        pywikibot.warning(Tkdialog)
        autonomous = True

    if autonomous:
        # No dialog was shown: take the generated values unchanged
        newPhotoDescription = photoDescription
        newFilename = filename
        skip = False

    # Do the actual upload
    # Would be nice to check before I upload if the file is already at
    # Commons. Not that important for this program, but maybe for
    # derived programs
    if skip:
        return 0

    bot = UploadRobot(photoUrl,
                      description=newPhotoDescription,
                      useFilename=newFilename,
                      keepFilename=True,
                      verifyDescription=False)
    bot.upload_image(debug=False)
    return 1


def getPhotos(flickr, user_id='', group_id='', photoset_id='',
              start_id='', end_id='', tags=''):
    """Loop over a set of Flickr photos.

    Get a set to work on (start with just a username).
    * Make it possible to delimit the set (from/to)

    The source is the group pool if group_id is given, else the photoset
    if photoset_id is given, else the public photos of user_id. Without
    any of them nothing is yielded.

    @return: generator of photo ids, from start_id (inclusive, or the
        first photo when empty) up to end_id (exclusive)
    """
    found_start_id = not start_id

    # https://www.flickr.com/services/api/flickr.groups.pools.getPhotos.html
    # Get the photos in a group
    if group_id:
        # First get the total number of photo's in the group
        photos = flickr.groups_pools_getPhotos(group_id=group_id,
                                               user_id=user_id, tags=tags,
                                               per_page='100', page='1')
        pages = photos.find('photos').attrib['pages']

        def gen(i):
            return list(flickr.groups_pools_getPhotos(
                group_id=group_id, user_id=user_id, tags=tags,
                per_page='100', page=i
            ).find('photos'))

    # https://www.flickr.com/services/api/flickr.photosets.getPhotos.html
    # Get the photos in a photoset
    elif photoset_id:
        photos = flickr.photosets_getPhotos(photoset_id=photoset_id,
                                            per_page='100', page='1')
        pages = photos.find('photoset').attrib['pages']

        def gen(i):
            return list(flickr.photosets_getPhotos(
                photoset_id=photoset_id, per_page='100', page=i
            ).find('photoset'))

    # https://www.flickr.com/services/api/flickr.people.getPublicPhotos.html
    # Get the (public) photos uploaded by a user
    elif user_id:
        photos = flickr.people_getPublicPhotos(user_id=user_id,
                                               per_page='100', page='1')
        pages = photos.find('photos').attrib['pages']

        def gen(i):
            return list(flickr.people_getPublicPhotos(
                user_id=user_id, per_page='100', page=i
            ).find('photos'))

    else:
        # No source given, so there is nothing to loop over
        return

    for i in range(1, int(pages) + 1):
        # Retry the page until the request succeeds; an empty page counts
        # as a success
        while True:
            try:
                pagePhotos = gen(i)
            except flickrapi.exceptions.FlickrError:
                pywikibot.output('Flickr api problem, sleeping')
                pywikibot.sleep(30)
            else:
                break

        for photo in pagePhotos:
            current_id = photo.attrib['id']
            if current_id == start_id:
                found_start_id = True
            if found_start_id:
                if current_id == end_id:
                    pywikibot.output('Found end_id')
                    return
                yield current_id


def _optionValue(arg, option, question):
    """Return the value of an 'option:value' argument.

    When arg is the bare option the user is asked the question instead.
    """
    if len(arg) == len(option):
        return pywikibot.input(question)
    # Skip the option name and the separator character after it
    return arg[len(option) + 1:]


def main(*args):
    """
    Process command line arguments and invoke bot.

    If args is an empty list, sys.argv is used.

    @param args: command line arguments
    @type args: str
    """
    local_args = pywikibot.handle_args(args)

    group_id = ''
    photoset_id = ''
    user_id = ''
    start_id = ''
    end_id = ''
    tags = ''
    addCategory = ''
    removeCategories = False
    autonomous = False
    totalPhotos = 0
    uploadedPhotos = 0

    # Do we mark the images as reviewed right away?
    flickrreview = config.flickr['review'] or False

    # Set the Flickr reviewer
    if config.flickr['reviewer']:
        reviewer = config.flickr['reviewer']
    elif 'commons' in config.usernames['commons']:
        reviewer = config.usernames['commons']['commons']
    else:
        reviewer = ''

    # Should be renamed to overrideLicense or something like that
    override = ''
    for arg in local_args:
        if arg.startswith('-group_id'):
            group_id = _optionValue(
                arg, '-group_id', 'What is the group_id of the pool?')
        elif arg.startswith('-photoset_id'):
            photoset_id = _optionValue(
                arg, '-photoset_id', 'What is the photoset_id?')
        elif arg.startswith('-user_id'):
            user_id = _optionValue(
                arg, '-user_id', 'What is the user_id of the flickr user?')
        elif arg.startswith('-start_id'):
            start_id = _optionValue(
                arg, '-start_id',
                'What is the id of the photo you want to start at?')
        elif arg.startswith('-end_id'):
            end_id = _optionValue(
                arg, '-end_id',
                'What is the id of the photo you want to end at?')
        elif arg.startswith('-tags'):
            tags = _optionValue(
                arg, '-tags',
                'What is the tag you want to filter out (currently only '
                'one supported)?')
        elif arg == '-flickrreview':
            flickrreview = True
        elif arg.startswith('-reviewer'):
            reviewer = _optionValue(arg, '-reviewer', 'Who is the reviewer?')
        elif arg.startswith('-override'):
            override = _optionValue(
                arg, '-override', 'What is the override text?')
        elif arg.startswith('-addcategory'):
            addCategory = _optionValue(
                arg, '-addcategory', 'What category do you want to add?')
        elif arg == '-removecategories':
            removeCategories = True
        elif arg == '-autonomous':
            autonomous = True

    if isinstance(flickrapi, ImportError):
        pywikibot.bot.suggest_help(missing_dependencies=('flickrapi',))
    elif not config.flickr['api_key']:
        additional_text = (
            'Flickr api key not found! Get yourself an api key\n'
            'Any flickr user can get a key at\n'
            'https://www.flickr.com/services/api/keys/apply/')
        pywikibot.bot.suggest_help(additional_text=additional_text)
    elif user_id or group_id or photoset_id:
        if 'api_secret' in config.flickr and config.flickr['api_secret']:
            flickr = flickrapi.FlickrAPI(config.flickr['api_key'],
                                         config.flickr['api_secret'])
        else:
            pywikibot.output('Accessing public content only')
            flickr = flickrapi.FlickrAPI(config.flickr['api_key'])

        for photo_id in getPhotos(flickr, user_id, group_id, photoset_id,
                                  start_id, end_id, tags):
            uploadedPhotos += processPhoto(flickr, photo_id, flickrreview,
                                           reviewer, override, addCategory,
                                           removeCategories, autonomous)
            totalPhotos += 1

    pywikibot.output('Finished running')
    pywikibot.output('Total photos: ' + str(totalPhotos))
    pywikibot.output('Uploaded photos: ' + str(uploadedPhotos))


if __name__ == '__main__':
    main()