#(c) 2007-2010 Paul Marques Mota
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
import datetime
import getopt
import gzip
import pprint
import re
import simplejson
import StringIO
import sys
import time
import urllib
import urllib2
# --- Module configuration ---------------------------------------------------
# API endpoint; empty here and assigned from the command line further down.
QUERY_URL = ""
# When true, ask the server for gzip-compressed responses.
GZIP = True
# Default language pair; both values are replaced by the command-line
# arguments when the script runs.
FROM_LANGUAGE = "pt"
TO_LANGUAGE = "fr"
# Value sent as the per-request limit (cmlimit / lllimit) in API queries.
uc_count = 500

# HTTP headers attached to every API request.
HEADERS = {"User-Agent": "User:Comte0 v2.0"}
if GZIP:
    HEADERS["Accept-encoding"] = "gzip"
def Query(**args):
    """Send one request to the API at QUERY_URL and return the decoded JSON.

    The keyword arguments are the API parameters; ``format=json`` is always
    added.  They are URL-encoded and sent as the request body together with
    the module-level HEADERS.

    Returns whatever ``simplejson.loads`` produces for the response body.
    Network errors (urllib2) and JSON decoding errors are not caught here and
    propagate to the caller.
    """
    args["format"] = "json"  # Output in JSON format
    req = urllib2.Request(QUERY_URL, urllib.urlencode(args), HEADERS)

    response = urllib2.urlopen(req)
    try:
        content = response.read()
        encoding = response.info().get("Content-Encoding", "")
    finally:
        # Release the connection even if reading the body fails.
        response.close()

    # Asking for gzip (see HEADERS) does not oblige the server to use it, so
    # only decompress when the response actually says it is gzip-encoded.
    if "gzip" in encoding.lower():
        content = gzip.GzipFile(fileobj=StringIO.StringIO(content)).read()

    return simplejson.loads(content)
def Process_Interwiki(page):
data = Query(action="query", prop="langlinks", titles=page, lllimit=uc_count, redirects="true")
if "redirects" in data["query"]:
return
if "langlinks" in data["query"]["pages"][data["query"]["pages"].keys()[0]]:
for i,lang in \
enumerate(data["query"]["pages"][data["query"]["pages"].keys()[0]]["langlinks"]):
if lang["lang"] == TO_LANGUAGE:
return
print '*[[' + page + ']] ([[:' + FROM_LANGUAGE+ ':' + page + '|' + FROM_LANGUAGE + ']])'
def Process(category, _visited=None):
    """Walk the members of `category` and check each article's interwiki.

    Every member in namespace 0 is handed to Process_Interwiki.  Members in
    namespace 1 are handled only when FROM_LANGUAGE is "ja".  Members in
    namespace 14 (categories) are processed recursively when SUBCAT is set.
    The member list is fetched in batches of `uc_count`, following the API's
    continuation token until no more results are announced.

    `category` is the full category title passed as ``cmtitle``.
    `_visited` is internal: the set of categories already walked during this
    run, shared across the recursion.  Callers should not pass it.
    """
    # Categories can (directly or indirectly) contain each other; without
    # this guard the subcategory recursion would never terminate.
    if _visited is None:
        _visited = set()
    if category in _visited:
        return
    _visited.add(category)

    uc_next = None
    while True:
        request = {
            "action": "query",
            "list": "categorymembers",
            "cmtitle": category,
            "cmlimit": uc_count,
        }
        if uc_next is not None:
            if DEBUG:
                print >> sys.stderr
                print >> sys.stderr, "\tucstart=" + uc_next
            # The token handed out as "cmcontinue" must be sent back under
            # the same name; it is not a plain sort key.
            request["cmcontinue"] = uc_next
        data = Query(**request)

        for item in data["query"]["categorymembers"]:
            title = item["title"].encode("utf8", "replace")
            if DEBUG:
                print >> sys.stderr, "\tprocessing: " + title
            if item["ns"] == 0:
                Process_Interwiki(title)
            if item["ns"] == 1 and FROM_LANGUAGE == "ja":
                # NOTE(review): this drops the first 3 *bytes* of the UTF-8
                # encoded title, not 3 characters -- confirm that this is
                # the intended way to strip the talk-page prefix.
                Process_Interwiki(title[3:])
            if item["ns"] == 14 and SUBCAT:
                Process(title, _visited)

        # Accept both continuation formats: the current top-level "continue"
        # object and the legacy "query-continue" one.
        if "continue" in data:
            uc_next = data["continue"].get("cmcontinue")
        elif "query-continue" in data:
            uc_next = data["query-continue"]["categorymembers"]["cmcontinue"]
        else:
            uc_next = None

        if not uc_next:
            break
        # urllib.urlencode calls str() on values, which fails for non-ASCII
        # unicode; hand it UTF-8 bytes instead.
        if isinstance(uc_next, unicode):
            uc_next = uc_next.encode("utf8")
USAGE = 'Usage: python query_categories.py [-d|--debug] [-s|--subcat] Category from_language to_language'
if len(sys.argv) < 4:
print >> sys.stderr, USAGE
else:
try:
opts, args = getopt.getopt(sys.argv[1:], "ds", ["debug", "subcat"])
except getopt.GetoptError, err:
# print help information and exit:
print >> sys.stderr, str(err) # will print something like "option -a not recognized"
print >> sys.stderr, USAGE
sys.exit(2)
DEBUG = False
SUBCAT = False
for o, a in opts:
if o in ("-d", "--debug"):
DEBUG = True
elif o in ("-s", "--subcat"):
SUBCAT = True
else:
assert False, "unhandled option"
TO_LANGUAGE = args[2]
FROM_LANGUAGE = args[1]
QUERY_URL = u"http://" + args[1] + ".wikipedia.org/w/api.php"
Process(args[0])