2008年1月15日星期二

下載Flickr裏的照片

某天澳門有個cosplay比賽
貌似規模不是很大的
那天有個『照相佬』,我八卦地問他要了相片
這裏
原來他用的是Flickr
我見要一張張下載很麻煩,所以就想寫個小程式一口氣下載回來
不過,這已經是一個月前的事了OTL

當然,我不是花了一個月寫這個小程式
事實上是昨晚開始寫的
敲了幾行代碼後就顧着和別人聊天
今天早上起床繼續寫的
其實都花了不少時間
………………………………
現在給出代碼
#!/usr/bin/env python
# coding=utf-8

from xml.dom import minidom, Node
import urllib2, HTMLParser
import os, sys, re

class MyHTMLParser(HTMLParser.HTMLParser):
    """HTML parser that gathers photo links and the next-page link.

    While parsing it records:

    * ``links``   -- the ``href`` of every ``<a>`` start tag seen while inside
      a ``<p class="Photo">`` element;
    * ``go_next`` -- the ``href`` of the last ``<a class="Next">`` seen, or
      ``""`` when there is none;
    * ``pics``    -- ``"alt: src"`` strings added through ``getPicSrc()``.

    When ``download`` is set to a true value, every ``<img>`` start tag that
    carries a ``src`` attribute is fetched with the external ``wget`` program.
    """

    def __init__(self):
        """Initialise the parser with empty result containers."""
        HTMLParser.HTMLParser.__init__(self)
        # "alt: src" strings recorded by getPicSrc()
        self.pics = []
        # href of the next-page link; stays "" when the page has none
        self.go_next = ""
        # hrefs collected from inside <p class="Photo"> elements
        self.links = []
        # true while the parser is inside a <p class="Photo"> element
        self.found = 0
        # when true, <img> tags trigger a download instead of being ignored
        self.download = 0

    def handle_starttag(self, tags, attrs):
        """Dispatch on the name of every opening tag the parser meets.

        ``tags`` is the tag name and ``attrs`` the list of ``(name, value)``
        pairs supplied by ``HTMLParser``.
        """
        if tags == 'p':
            # entering the paragraph that wraps a photo link
            if ('class', 'Photo') in attrs:
                self.found = 1
        elif tags == 'a':
            self._handleAnchor(attrs)
        elif tags == 'img' and self.download:
            # NOTE: every <img> of the page is fetched; nothing here filters
            # for the one target picture.
            for name, value in attrs:
                if name == 'src' and value:
                    self._download(value)

    def _handleAnchor(self, attrs):
        """Record an ``<a>`` tag as a photo link and/or the next-page link."""
        href = None
        is_next = False
        # Read all attributes first so the outcome does not depend on whether
        # ``class`` or ``href`` is written first in the tag.
        for name, value in attrs:
            if name == 'href':
                href = value
            elif name == 'class' and value == 'Next':
                is_next = True
        if href is None:
            # an anchor without a target is of no use to either list
            return
        if self.found:
            self.links.append(href)
        if is_next:
            self.go_next = href

    def _download(self, url):
        """Fetch ``url`` with the external ``wget`` program."""
        # imported locally so the file-level imports need no change
        import subprocess
        try:
            # An argument list (no shell) keeps characters such as '&' or ';'
            # in a URL taken from the page from being interpreted by a shell;
            # '--' stops a URL that starts with '-' being read as an option.
            subprocess.call(["wget", "--", url])
        except OSError:
            # wget is missing or cannot be executed: report it, keep parsing
            sys.stderr.write("could not run wget for %s\n" % url)

    def handle_endtag(self, tag):
        """Leave photo-link mode when a ``</p>`` closes."""
        if tag == 'p':
            self.found = 0

    def getPicSrc(self, attrs):
        """Append ``"alt: src"`` to ``pics`` when ``attrs`` has a ``src``.

        ``attrs`` is a list of ``(name, value)`` pairs; when a name occurs
        more than once the last value wins. Nothing is recorded when ``src``
        is absent or empty.
        """
        pic_name = ""
        pic_src = ""
        for name, value in attrs:
            if name == 'alt':
                pic_name = value
            if name == 'src':
                pic_src = value
        if pic_src:
            self.pics.append("%s: %s" % (pic_name, pic_src))

    def old_result(self):
        """Return the ``pics`` entries that mention a jpg, one per line.

        Each entry is cut after its last (case-insensitive) ``jpg`` and
        followed by ``" \\n"``; entries without ``jpg`` are left out.
        """
        jpg_prefix = re.compile('.*jpg', re.I | re.U)
        lines = []
        for entry in self.pics:
            hits = jpg_prefix.findall(entry)
            if hits:
                lines.append("%s \n" % hits[0])
        return "".join(lines)

    def result(self):
        """Return the collected photo links, each followed by a newline."""
        return "".join("%s\n" % link for link in self.links)

class main:
    """Walk a paginated Flickr listing and download every photo found.

    Constructing the object runs the whole job: the page at ``url`` and every
    page reachable through its "Next" links are scanned for photo links, then
    the large-size page of each photo is opened and its images are fetched.

    Messages are printed with the single-argument ``print(...)`` form, which
    produces the same output as the Python 2 ``print`` statement.
    """

    def __init__(self, url):
        """Scan the listing that starts at ``url`` and download its photos."""
        # final URL of the page fetched most recently (after redirects)
        self.page_url = ""
        # one list of photo links per listing page
        self.link_list = []
        # next-page href found by the DOM based helpers
        self.next_page = ""
        print("Program Start")
        self.getPage(url)
        print("Search all pictures Finished!!")
        self.getPic(self.link_list)

    def getPic(self, link_list):
        """Open the large-size page of every photo link and fetch its images.

        ``link_list`` is a list of lists of photo-page links, as built by
        ``getPage``.
        """
        img = "http://www.flickr.com/photo_zoom.gne?id=%s&size=l"
        for page_links in link_list:
            for link in page_links:
                photo_id = self._photoId(link)
                if not photo_id:
                    # nothing usable in this link; skip it
                    continue
                data = self.goLink(img % photo_id)
                if data:
                    parser = MyHTMLParser()
                    # in download mode the parser fetches the <img> sources
                    parser.download = 1
                    parser.feed(data)

    def _photoId(self, link):
        """Return the last non-empty path segment of ``link``."""
        # Stripping trailing slashes first gives the same segment whether or
        # not the link ends with '/', and never indexes out of range.
        return link.rstrip('/').split('/')[-1]

    def getPage(self, url):
        """Collect the photo links of ``url`` and of every following page.

        The links of each page are appended to ``self.link_list`` as one list.
        Scanning stops when a page has no next-page link, when a fetch returns
        no data, or when a page that was already scanned comes up again.
        """
        visited = set()
        next_url = url
        while next_url:
            data = self.goLink(next_url)
            # Stop on an empty response or a repeated page; retrying the same
            # link would never terminate.
            if not data or self.page_url in visited:
                break
            visited.add(self.page_url)
            parser = MyHTMLParser()
            parser.feed(data)
            self.link_list.append(parser.links)
            next_url = parser.go_next
            if next_url:
                print("Next: %s" % next_url)

    def getPageDOM(self, url):
        """Look for the next-page link of ``url`` using a DOM parse.

        The page must be well-formed XML; ``minidom.parseString`` raises
        otherwise.
        """
        data = self.goLink(url)
        if not data:
            return
        doc = minidom.parseString(data)
        # <body> sits below the root element, so search the whole tree
        # rather than only the document's direct children.
        for body in doc.getElementsByTagName('body'):
            self.handle(body)

    def handle(self, node):
        """Store the href of an ``<a class="Next">`` below ``node``.

        Every descendant anchor is examined; when several match, the last one
        in document order is kept in ``self.next_page``.
        """
        for anchor in node.getElementsByTagName('a'):
            if anchor.getAttribute('class') == 'Next':
                self.next_page = anchor.getAttribute('href')

    def _resolveUrl(self, input_url):
        """Return ``input_url`` made absolute against ``self.page_url``."""
        # imported locally so the file-level imports need no change
        try:
            from urlparse import urljoin  # Python 2
        except ImportError:
            from urllib.parse import urljoin  # Python 3
        # urljoin returns absolute URLs unchanged and resolves root-relative
        # and directory-relative ones against the last fetched page.
        return urljoin(self.page_url, input_url)

    def goLink(self, input_url):
        """Open ``input_url`` and return the body of the response.

        Relative URLs are resolved against the page fetched previously. On
        success ``self.page_url`` is set to the final URL of the response.
        Errors raised by ``urllib2`` are not caught here.
        """
        url = self._resolveUrl(input_url)
        print("Open URL: %s" % url)
        request = urllib2.Request(url)
        html_p = urllib2.urlopen(request)
        try:
            print("Loading.........")
            data = html_p.read()
            final_url = html_p.geturl()
        finally:
            # release the connection even when reading fails
            html_p.close()
        print("download page successful")
        self.page_url = final_url
        return data


if __name__ == "__main__":
    # Exactly one argument is expected: the URL of the first page to scan.
    if len(sys.argv) == 2:
        main(sys.argv[1])
    else:
        # Tell the user how to call the script instead of exiting silently.
        sys.stderr.write("Usage: %s <flickr-page-url>\n" % sys.argv[0])
        sys.exit(2)

事實上這只是半完成品
首先,下載時漏了判斷目標圖片(一時心急忘記了OTL)
會把最後頁面裏的所有圖片都下載回來
而且寫了很多沒用的function
是測試時候用的
一開始打算用DOM分析HTML文件
但又搞不懂Python怎麼建立DOM
看書又說DOM不適合這情況

還有,我是用了wget來下載
因為一會兒要出去BBQ
趕時間就偷偷懶
所以這些代碼未必能在其他電腦用
現在要出去啦
今天晚上回來再改進
發佈留言

熱門文章