I2P Pastebin parser

27 May 2011

I forked a Pastebin parser I read about on Hacker News and adapted it to parse the only pastebin I know of that is hosted on I2P.
I thought I would stumble upon a lot more interesting stuff on the I2P pastebin than on the vanilla internet version. Turns out I didn't.
The anonymous people chilling out on I2P seem like a really friendly bunch :).
The only thing I really found of interest was a script to spoof MAC addresses on AirPorts. (Then again, there aren't a lot of pastes in the I2P pastebin (yet!))

Here is the code to parse the I2P pastebin (you need BeautifulSoup installed and an I2P HTTP proxy running on port 4444 for this to work).

import BeautifulSoup
import datetime
import os
import Queue
import random
import sys
import threading
import time
import urllib2

pastesseen = set()
pastes = Queue.Queue()

# Route all HTTP requests through the local I2P proxy.
proxy = {"http": "http://127.0.0.1:4444/"}
handler = urllib2.ProxyHandler(proxy)
opener = urllib2.build_opener(handler)

def downloader():
    # Pull paste ids off the queue and save their content to disk.
    while True:
        paste = pastes.get()
        fn = "pastebinsi2p/%s-%s.txt" % (paste, datetime.datetime.today().strftime("%Y-%m-%d"))
        content = opener.open("http://empth.i2p/pastebin/" + paste).read()
        soup = BeautifulSoup.BeautifulSoup(content)
        content = str(soup.find(id="content"))
        if "requesting a little bit too much" in content:
            # The pastebin is throttling us; put the paste back and try again later.
            print "Throttling... requeuing %s" % paste
            pastes.put(paste)
            time.sleep(0.1)
        else:
            f = open(fn, "wt")
            f.write(content)
            f.close()
        delay = 1.1  # random.uniform(1, 3)
        sys.stdout.write("Downloaded %s, waiting %f sec\n" % (paste, delay))
        time.sleep(delay)
        pastes.task_done()

def scraper():
    # Poll the recent-pastes page ten times and queue anything we haven't seen.
    scrapecount = 0
    while scrapecount < 10:
        html = opener.open("http://empth.i2p/pastebin/recent.php").read()
        soup = BeautifulSoup.BeautifulSoup(html)
        div = soup.find(id="recent")
        ul = div.find("ul")
        for li in ul.findAll("li"):
            href = li.a["href"]
            if href in pastesseen:
                sys.stdout.write("%s already seen\n" % href)
            else:
                pastes.put(href)
                pastesseen.add(href)
                sys.stdout.write("%s queued for download\n" % href)
        # Sleep between polls of recent.php, not between individual links.
        delay = 12  # random.uniform(6, 10)
        time.sleep(delay)
        scrapecount += 1

# Create the output directory before the workers can write to it.
if not os.path.exists("pastebinsi2p"):
    os.mkdir("pastebinsi2p")  # Thanks, threecheese!

num_workers = 1
for i in range(num_workers):
    t = threading.Thread(target=downloader)
    t.setDaemon(True)
    t.start()

s = threading.Thread(target=scraper)
s.start()
s.join()

It is forked from this blog post (hope you don't mind, mate).
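
If you want to check that the I2P proxy is reachable before kicking off the scraper, a minimal sanity check might look something like this (it assumes the default HTTP proxy at 127.0.0.1:4444 and just tries to fetch the pastebin's front page):

import urllib2

# Assumes the default I2P HTTP proxy on 127.0.0.1:4444.
proxy = {"http": "http://127.0.0.1:4444/"}
opener = urllib2.build_opener(urllib2.ProxyHandler(proxy))

try:
    opener.open("http://empth.i2p/pastebin/", timeout=10).read()
    print "Proxy is up and empth.i2p is reachable"
except urllib2.URLError, e:
    print "Could not reach the pastebin through the proxy: %s" % e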
