Import MediaWiki To XWiki
Last modified by Vincent Massol on 2024/07/05 18:06
Import the contents of a MediaWiki instance to an XWiki instance |
Type | Other |
Category | |
Developed by | |
Rating | |
License | GNU Lesser General Public License 2.1 |
Table of contents
Description
It uses
- A MediaWiki XML dump (for instance the Wikipedia one, downloaded from 1)
- Dom4J for parsing Wikipedia XML contents
- WikiModel for converting MediaWiki syntax to XWiki syntax.
- The Groovy script below
This script should work with any MediaWiki export. It still needs to be improved to handle revisions, talk pages, images, etc. Currently it imports only the text of the latest revision.
Groovy script
import org.dom4j.io.SAXReader
import org.dom4j.*
import groovy.net.xmlrpc.*
import java.net.ServerSocket
import org.wikimodel.wem.mediawiki.MediaWikiParser
import org.wikimodel.wem.xwiki.*
/**
 * dom4j ElementHandler that pushes each parsed page element to a wiki through
 * the XML-RPC proxy and then detaches the element from the document, so the
 * in-memory tree does not grow while a large dump is being read.
 *
 * For every page it reads the title, the text of the 'revision' child and the
 * contributor, converts the MediaWiki markup with WikiModel and calls
 * confluence1.storePage on the proxy. Pages whose first 30 characters contain
 * "redirect" are skipped, and nothing is stored once 'max' pages have been seen.
 */
class PruningPageHandler implements ElementHandler {
    // XML-RPC proxy and session token used for storePage calls.
    def proxy, token;
    // Number of page elements seen so far (redirects and skipped pages included).
    def counter = 0;
    // Pages are only stored while counter < max.
    def max = 10000;
    def messages = []

    PruningPageHandler(proxy, token) {
        this.proxy = proxy
        this.token = token
    }

    public void onStart(ElementPath path) { }

    /**
     * Called by dom4j when a handled element has been fully parsed.
     * Stores the page if it qualifies, then always prunes it from the tree.
     */
    public void onEnd(ElementPath path) {
        def page = path.current
        def title = page.elementText('title')
        title = title.replaceAll(' ','_')
        println(title+ '('+counter+')')
        counter++;

        def revision = page.element('revision')
        def revtext = revision?.elementText('text')
        if (revtext == null) {
            // Without this guard a page lacking revision text raised a
            // NullPointerException and aborted the whole import.
            println('Skipping ' + title + ': no revision text')
            page.detach() // prune the tree
            return
        }

        // Anonymous edits carry an <ip> element instead of <username>
        // (see the MediaWiki export schema); fall back to it.
        def contributor = revision.element('contributor')
        def username = contributor?.elementText('username') ?: contributor?.elementText('ip')

        def index = revtext.substring(0, Math.min(30,revtext.length())).toLowerCase().indexOf("redirect")
        if (counter < max && index < 0) {
            def map = new HashMap()
            map.put('content', convertToXWikiSyntax(revtext))
            if (username != null) {
                // XML-RPC has no standard null value, so only send the
                // modifier when one is known.
                map.put('modifier', username)
            }
            map.put('space','Wikipedia')
            map.put('title',title)
            try {
                proxy.confluence1.storePage(token, map)
            } catch (Exception e) {
                // Best effort: report the failure and continue with the next page.
                println(e.getMessage())
            }
        }
        page.detach() // prune the tree
    }

    /**
     * Cleans up the MediaWiki markup and converts it with WikiModel.
     * Returns the cleaned-up source text unchanged when the conversion fails,
     * so a parser error never results in an empty or truncated page.
     */
    private String convertToXWikiSyntax(String revtext) {
        // Pre-processing applied before handing the text to the parser.
        revtext = revtext.replaceFirst("^-", "*");
        revtext = revtext.replaceAll("__","")
        revtext = revtext.replaceAll("[\\|][\\+]","")
        try {
            // Convert into a separate buffer and only use it on success.
            def converted = new StringBuffer()
            def parser = new MediaWikiParser();
            parser.parse(new StringReader(revtext), new XWikiSerializer(converted));
            return converted.toString()
        } catch (Exception e) {
            println(e.getMessage())
            return revtext
        }
    }
}
// Open an XML-RPC session on the target wiki (empty user name and password).
def proxy = new XMLRPCServerProxy("http://xwikiserver/xwiki/xmlrpc/confluence")
def token = proxy.confluence1.login("","")

// Stream the dump: the handler is invoked for each /mediawiki/page element
// as soon as it has been parsed.
def reader = new SAXReader()
def handler = new PruningPageHandler(proxy, token)
reader.addHandler('/mediawiki/page', handler)
reader.setEncoding('UTF-8')

File f = new File("/home/slauriere/enwiki-20070908-pages-articles.xml.bz2.1.out")
// withInputStream closes the stream even if parsing throws.
f.withInputStream { fis ->
    reader.read(fis)
}
import org.dom4j.*
import groovy.net.xmlrpc.*
import java.net.ServerSocket
import org.wikimodel.wem.mediawiki.MediaWikiParser
import org.wikimodel.wem.xwiki.*
/**
 * dom4j ElementHandler that pushes each parsed page element to a wiki through
 * the XML-RPC proxy and then detaches the element from the document, so the
 * in-memory tree does not grow while a large dump is being read.
 *
 * For every page it reads the title, the text of the 'revision' child and the
 * contributor, converts the MediaWiki markup with WikiModel and calls
 * confluence1.storePage on the proxy. Pages whose first 30 characters contain
 * "redirect" are skipped, and nothing is stored once 'max' pages have been seen.
 */
class PruningPageHandler implements ElementHandler {
    // XML-RPC proxy and session token used for storePage calls.
    def proxy, token;
    // Number of page elements seen so far (redirects and skipped pages included).
    def counter = 0;
    // Pages are only stored while counter < max.
    def max = 10000;
    def messages = []

    PruningPageHandler(proxy, token) {
        this.proxy = proxy
        this.token = token
    }

    public void onStart(ElementPath path) { }

    /**
     * Called by dom4j when a handled element has been fully parsed.
     * Stores the page if it qualifies, then always prunes it from the tree.
     */
    public void onEnd(ElementPath path) {
        def page = path.current
        def title = page.elementText('title')
        title = title.replaceAll(' ','_')
        println(title+ '('+counter+')')
        counter++;

        def revision = page.element('revision')
        def revtext = revision?.elementText('text')
        if (revtext == null) {
            // Without this guard a page lacking revision text raised a
            // NullPointerException and aborted the whole import.
            println('Skipping ' + title + ': no revision text')
            page.detach() // prune the tree
            return
        }

        // Anonymous edits carry an <ip> element instead of <username>
        // (see the MediaWiki export schema); fall back to it.
        def contributor = revision.element('contributor')
        def username = contributor?.elementText('username') ?: contributor?.elementText('ip')

        def index = revtext.substring(0, Math.min(30,revtext.length())).toLowerCase().indexOf("redirect")
        if (counter < max && index < 0) {
            def map = new HashMap()
            map.put('content', convertToXWikiSyntax(revtext))
            if (username != null) {
                // XML-RPC has no standard null value, so only send the
                // modifier when one is known.
                map.put('modifier', username)
            }
            map.put('space','Wikipedia')
            map.put('title',title)
            try {
                proxy.confluence1.storePage(token, map)
            } catch (Exception e) {
                // Best effort: report the failure and continue with the next page.
                println(e.getMessage())
            }
        }
        page.detach() // prune the tree
    }

    /**
     * Cleans up the MediaWiki markup and converts it with WikiModel.
     * Returns the cleaned-up source text unchanged when the conversion fails,
     * so a parser error never results in an empty or truncated page.
     */
    private String convertToXWikiSyntax(String revtext) {
        // Pre-processing applied before handing the text to the parser.
        revtext = revtext.replaceFirst("^-", "*");
        revtext = revtext.replaceAll("__","")
        revtext = revtext.replaceAll("[\\|][\\+]","")
        try {
            // Convert into a separate buffer and only use it on success.
            def converted = new StringBuffer()
            def parser = new MediaWikiParser();
            parser.parse(new StringReader(revtext), new XWikiSerializer(converted));
            return converted.toString()
        } catch (Exception e) {
            println(e.getMessage())
            return revtext
        }
    }
}
// Open an XML-RPC session on the target wiki (empty user name and password).
def proxy = new XMLRPCServerProxy("http://xwikiserver/xwiki/xmlrpc/confluence")
def token = proxy.confluence1.login("","")

// Stream the dump: the handler is invoked for each /mediawiki/page element
// as soon as it has been parsed.
def reader = new SAXReader()
def handler = new PruningPageHandler(proxy, token)
reader.addHandler('/mediawiki/page', handler)
reader.setEncoding('UTF-8')

File f = new File("/home/slauriere/enwiki-20070908-pages-articles.xml.bz2.1.out")
// withInputStream closes the stream even if parsing throws.
f.withInputStream { fis ->
    reader.read(fis)
}
Todos
- Fix pending WikiModel converter issues (tables, upper case, etc.)
- Work directly on a compressed file
- First letter of a link should be upper case, see for instance "autism spectrum disorder" at http://en.wikipedia.org/w/index.php?title=Albedo&action=edit
- Issues on the following pages:
- Albedo
- Adobe
- ...