Bruger:Wegge/Statistik/DumpToDat.py

Many eyes make all bugs shallow

This is version 0.1 of the program, written in Python, that I use to mangle an XML dump into plottable data. It has not been tested all that much, so if you find a bug, I would like to hear about it. The program is released under the GPL, which is close enough to the GFDL that the source code can live on Wikipedia. A short sketch of how the output can be plotted follows the listing.

#! /usr/bin/python
# -*- coding: utf-8 -*-
#
# Copyright(2006) Anders Wegge Jakobsen
# Available under the FSF GPL license
#
from xml.sax import saxutils
from xml.sax import make_parser

import sys, time, exceptions, calendar
import profile
# Page::Title, Page::FirstRevDate
# User::FirstEditDate
# Date::Users, Date::Pages

debug = 0

danamespaces = [ 'Media', 'Speciel', 'Diskussion', 'Bruger',
                 'Bruger diskussion', 'Wikipedia', 'Wikipedia diskussion', 
                 'Billede', 'Billede diskussion', 'MediaWiki',
                 'MediaWiki diskussion', 'Skabelon', 'Skabelon diskussion', 
                 u'Hjælp', u'Hjælp diskussion', 'Kategori',
                 'Kategori diskussion', 'WikiWegge' ]

darobots = [ 'WeggeBot', 'TwidRobot' ]

class DayStat:
    def __init__(self):
        self.TimeStamp = 0
        self.NewArticlesUser = 0
        self.NewArticlesRobot = 0
        self.NewUsers = 0

class WikiArticle:
    def __init__ (self):
        self.Title = ''
        self.Oldest = 0
        self.Newest = 0
        self.isArticleNs = False
        self.isRobotGen = False
        self.isRedirect = False
       
class WikiUser:
    def __init__ (self):
        self.Name = ''
        self.First = 0
        self.Last = 0
       
class WikiDump (saxutils.DefaultHandler):
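    # SAX content handler: startElement/endElement track where we are in the
    # dump via the self.nowIn stack, characters() collects the text of the
    # interesting elements, and the results are accumulated per page in
    # self.Pages and per contributor in self.Users.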
    def __init__(self):
        # Parser state
        self.nowIn = []

        self.UserName = []
        self.Title = []
        self.Text = []
        self.TimeStamp = 0
        
        # Users -> First edit date
        self.Users = {}

        # Page -> First revision, Title, isRedirect
        self.Pages = {}

    def isRedirect(self, text):
        # Is this page a redirect?
        return text.find("#REDIRECT [[") == 0

    def isRealArticle(self, text):
        # Is this page a real article?
        return text.find("[[") != 0 and not self.isRedirect(text)

    def isArticleNs(self, title):
        # Is this an article in the main namespace?
        if ':' in title:
            for ns in danamespaces:
                if title.startswith(ns + ':'):
                    # The title carries a known namespace prefix
                    return False
        return True

    def isRobot(self, contributor):
        # Was this revision made by one of the known robots?
        return contributor in darobots
        
    def characters(self, ch):
        if self.nowIn[-1] == 'title':
            self.Title.append(ch)
        if self.nowIn[-1] == 'timestamp':
            self.TimeStampText.append(ch)
        if self.nowIn[-1] == 'username':
            self.UserName.append(ch)
        if self.nowIn[-1] == 'text':
            self.Text.append(ch)
            
    def startElement (self, name, attrs):
        self.nowIn.append(name)
        if debug > 9:
            print '>> ' + name
        if name == 'title':
            self.Title = []
        if name == 'timestamp':
            self.TimeStampText = []
        if name == 'contributor':
            # Reset here as well, so an anonymous edit (no <username>)
            # is not credited to the previous contributor
            self.UserName = []
        if name == 'username':
            self.UserName = []
        if name == 'text':
            self.Text = []

    def endElement (self, name):
        if debug > 9:
            print '<<' + name
        if self.nowIn.pop() != name:
            print 'Something is rotten!'
            print 'Removing: ' + name + ' from: '
            print self.nowIn 
            raise Exception

        if name == 'title':
            self.Title = "".join(self.Title)
            return
        
        if name == 'revision':
            self.Text = "".join(self.Text)
            # We now have the revision text and timestamp

            if self.Title in self.Pages:
                wa = self.Pages[self.Title]
                if wa.Oldest > self.TimeStamp:
                    # Earlier revision than any seen so far
                    wa.Oldest = self.TimeStamp
                    wa.isRobotGen = self.isRobot(self.UserName)
                if wa.Newest < self.TimeStamp:
                    # Later revision than any seen so far
                    wa.Newest = self.TimeStamp
                    wa.isRedirect = self.isRedirect(self.Text)
            else:
                wa = WikiArticle()
                wa.isArticleNs = self.isArticleNs(self.Title)
                wa.Title = self.Title
                wa.Oldest = self.TimeStamp
                wa.Newest = self.TimeStamp
                wa.isRobotGen = self.isRobot(self.UserName)
                wa.isRedirect = self.isRedirect(self.Text)
            self.Pages[self.Title] = wa
            return
        
        if name == 'timestamp':
            self.TimeStampText = "".join(self.TimeStampText)
            tmt = time.strptime(self.TimeStampText, '%Y-%m-%dT%H:%M:%SZ')
            self.TimeStamp = calendar.timegm((tmt.tm_year, tmt.tm_mon,
                                              tmt.tm_mday, 0, 0, 0))
            return
            
        if name == 'contributor':
            
            # Was it a registered user?
            self.UserName = "".join(self.UserName)
            if self.UserName in self.Users:
                wu = self.Users[self.UserName]
                if wu.First > self.TimeStamp:
                    wu.First = self.TimeStamp
                if wu.Last < self.TimeStamp:
                    wu.Last = self.TimeStamp
                self.Users[self.UserName] = wu
            elif self.UserName:
                wu = WikiUser()
                wu.Name = self.UserName
                wu.First = self.TimeStamp
                wu.Last = self.TimeStamp
                self.Users[self.UserName] = wu
            return

def main():
    filename = 'dawiki-20060220-pages-meta-history.xml'
#    filename = '/tmp/wikiwegge.xml'

    wd = WikiDump()

    parser = make_parser()
    parser.setContentHandler(wd)
    parser.parse(filename)

    hist = {}
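    # hist maps a day-truncated Unix timestamp to the DayStat for that day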

    for i in wd.Users.values():
        if i.First in hist:
            h = hist[i.First]
        else:
            h = DayStat()
            h.TimeStamp = i.First
        h.NewUsers += 1
        hist[i.First] = h
        
    for i in wd.Pages.values():
        if i.Oldest in hist:
            h = hist[i.Oldest]
        else:
            h = DayStat()
            h.TimeStamp = i.Oldest
        if i.isRobotGen and not i.isRedirect and i.isArticleNs:
            h.NewArticlesRobot += 1
        elif not i.isRobotGen and not i.isRedirect and i.isArticleNs:
            h.NewArticlesUser += 1
        hist[i.Oldest] = h

    NewPagesUsersByDate = hist.values()
    # Sort the per-day stats chronologically before accumulating totals
    NewPagesUsersByDate.sort(key=lambda day: day.TimeStamp)

    #
    # And now ... output
    #

    print '#\n# Stats based on %s\n#\n' % filename
    print '# Timestamp, Number of users, Total pages by user,' \
          +' Total pages by robots'

    users = 0
    userpages = 0
    botpages = 0
    
    for i in NewPagesUsersByDate:
        users += i.NewUsers
        botpages += i.NewArticlesRobot
        userpages += i.NewArticlesUser
        
        print '%d %d %d %d' % (i.TimeStamp, users, userpages, botpages)
    
if __name__ == "__main__":
    main()
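
The script writes a plain four-column data file to standard output: the day as a Unix timestamp, the cumulative number of users, the cumulative number of articles created by users, and the cumulative number of articles created by robots. So python DumpToDat.py > dawiki.dat gives a file that most plotting tools can read directly. Below is a minimal plotting sketch, assuming matplotlib is installed; the file names dawiki.dat and dawiki-growth.png are only examples and not part of the program above.

#! /usr/bin/python
# -*- coding: utf-8 -*-
# Minimal sketch: plot the running totals produced by DumpToDat.py.
# Assumes matplotlib and an output file named 'dawiki.dat' (example name).
import datetime
import matplotlib.pyplot as plt

days, users, userpages, botpages = [], [], [], []
for line in open('dawiki.dat'):
    line = line.strip()
    if not line or line.startswith('#'):
        continue                      # skip the comment header and blank lines
    ts, u, up, bp = [int(x) for x in line.split()]
    days.append(datetime.datetime.utcfromtimestamp(ts))
    users.append(u)
    userpages.append(up)
    botpages.append(bp)

plt.plot(days, users, label='Users')
plt.plot(days, userpages, label='Articles by users')
plt.plot(days, botpages, label='Articles by robots')
plt.legend(loc='upper left')
plt.savefig('dawiki-growth.png')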