# 4chan_downloader.py (2,19 KB)
import re, urllib.request, http.client, io, json, os, os.path, shutil
# Matches a thread URL and captures the board name and the numeric thread id.
# Raw string so that `\.` reaches the regex engine as an escaped dot instead of
# being an invalid string escape sequence.
VIERCHAN_URL = re.compile(r"https?://boards\.4chan\.org/(?P<board>[a-z0-9]+)/res/(?P<threadid>[0-9]+)")

# URL template for a thread's JSON document; filled with `board` and `threadid`.
API_URL = "https://a.4cdn.org/{board}/res/{threadid}.json"

# Host and path template for full-size files; `tim` and `ext` are format
# fields of the path template.
PICTURE_HOST = "i.4cdn.org"
PICTURE_URL = "/{board}/src/{tim}{ext}"

# Host and path template for thumbnails; the template fixes the `s.jpg` suffix.
THUMB_HOST = "t.4cdn.org"
THUMB_URL = "/{board}/thumb/{tim}s.jpg"
# Thread that is downloaded when this file is run as a script.
dat_url = 'https://boards.4chan.org/b/res/521480591'


def _save_if_ok(conn, url, dest_path):
    """GET *url* over *conn* and write the body to *dest_path* on HTTP 200.

    Returns True when the file was written, False for any other status.
    The response is always consumed and closed: http.client refuses to hand
    out the next response on a persistent connection until the previous one
    has been read completely.
    """
    conn.request('GET', url)
    resp = conn.getresponse()
    try:
        if resp.status != 200:
            # Drain the error body so the connection can be reused.
            resp.read()
            return False
        with open(dest_path, 'wb') as out:
            shutil.copyfileobj(resp, out)
        return True
    finally:
        resp.close()


def download_thread(url):
    """Download the JSON, pictures and thumbnails of the thread at *url*.

    Writes ``<board>_<threadid>.json`` to the current directory, and the
    files of every post that has a ``tim`` key to ``<board>_<threadid>/src``
    and ``<board>_<threadid>/thumb``.

    Returns False without doing anything when *url* does not match
    VIERCHAN_URL, True otherwise. Errors from the network or the file system
    propagate to the caller.
    """
    match = VIERCHAN_URL.match(url)
    if not match:
        return False
    board = match.group('board')
    threadid = match.group('threadid')

    with urllib.request.urlopen(API_URL.format(board=board, threadid=threadid)) as api:
        api_json = json.loads(api.read().decode('utf-8'))

    prefix = "{board}_{threadid}".format(board=board, threadid=threadid)
    with open(prefix + '.json', mode='wt', encoding='utf-8') as json_file:
        json.dump(api_json, json_file)

    picdir = os.path.join(prefix, 'src')
    thumbdir = os.path.join(prefix, 'thumb')
    os.makedirs(picdir, exist_ok=True)
    os.makedirs(thumbdir, exist_ok=True)

    # One connection per host, reused for every file of the thread.
    picconn = http.client.HTTPSConnection(PICTURE_HOST)
    thumbconn = http.client.HTTPSConnection(THUMB_HOST)
    try:
        for post in api_json['posts']:
            if 'tim' not in post:
                continue
            tim = post['tim']
            ext = post['ext']
            _save_if_ok(
                picconn,
                PICTURE_URL.format(board=board, tim=tim, ext=ext),
                os.path.join(picdir, "{tim}{ext}".format(tim=tim, ext=ext)),
            )
            _save_if_ok(
                thumbconn,
                THUMB_URL.format(board=board, tim=tim),
                os.path.join(thumbdir, "{tim}s.jpg".format(tim=tim)),
            )
    finally:
        # Close both connections even when a download fails midway.
        picconn.close()
        thumbconn.close()
    return True


if __name__ == '__main__':
    download_thread(dat_url)