ruby图片抓取
谨以此 117 行代码,来怀念一下我曾经的 Ruby 生活。
这是一个用来抓取图片的脚本
其中 F2010MEN.txt 和 S2011MEN.txt 的内容是以逗号分隔的页面路径,格式为:
"/fashionshows/review/S2011MEN-ACNE","/fashionshows/review/S2011MEN-AKIMMEL",……
require 'rubygems'
require 'pp'
require 'nokogiri'
require 'open-uri'
require 'fileutils'
require 'timeout'

# Downloads slideshow images from style.com.
#
# For every page path listed in the input files, the script fetches the
# slideshow XML from the data-streamer API, stores it under pages/api/, and
# then downloads each FULL_SCREEN_IMAGE photo into images/<season>/<design>/
# next to a README.txt describing the show.
class GetImages
  # Captures "<season>-<design code>" from a page path such as
  # "/fashionshows/review/S2011MEN-ACNE".
  PAGE_PATTERN = /(\w+)-(\w+)/

  def initialize
    @file_names = ['F2010MEN.txt', 'S2011MEN.txt']
    @api = "http://www.style.com/api/data-streamer?id=empty&out=xml&dsType=slideshow&season=<SEASON>&designCode=<DESIGNCODE>"
    @home = "http://www.style.com"
  end

  # Processes every page listed in every input file.
  #
  # All entries are visited; entries that do not match PAGE_PATTERN are
  # skipped inside #process_page, so stray separators or blank trailing
  # entries in the input files are harmless.
  def start
    @file_names.each do |name|
      getpages(name).each { |p_url| process_page(p_url) }
    end
  end

  # Reads the comma-separated list of page paths for one input file.
  #
  # @param name [String] file name inside the "menswear" directory
  # @return [Array<String>] the raw entries, split on ","
  # @raise [Errno::ENOENT] if the input file does not exist
  def getpages(name)
    # File.join keeps the path portable; "/" is also accepted on Windows.
    File.read(File.join('menswear', name)).split(',')
  end

  # Turns a site-relative path into an absolute URL.
  #
  # @param url [String] absolute URL or path relative to the site root
  # @return [String] the URL unchanged if it already starts with "http",
  #   otherwise prefixed with the site home
  def full_path(url)
    url.start_with?('http') ? url : @home + url
  end

  # Builds a file-system-safe name from a URL by replacing every non-word
  # character with "_".
  #
  # @param url [String]
  # @return [String]
  def make_uri(url)
    url.gsub(/\W/, '_')
  end

  # Downloads a binary resource to a local file.
  #
  # The remote content is fetched BEFORE the target file is created, so a
  # failed download never leaves an empty file behind that a later run would
  # mistake for an already-downloaded image.
  #
  # @param a_url [String] remote URL
  # @param a_uri [String] local target path
  def download_image(a_url, a_uri)
    data = fetch(a_url)
    File.open(a_uri, 'wb') { |file| file.write(data) }
  end

  # Downloads an XML document to a local file (text mode).
  #
  # @param a_url [String] remote URL
  # @param a_uri [String] local target path
  def download_xml(a_url, a_uri)
    data = fetch(a_url)
    File.open(a_uri, 'w') { |file| file.write(data) }
  end

  # Writes (overwriting) README.txt in +dir+ with the show's title, date,
  # city, designer and body text taken from the slideshow document.
  #
  # @param dir [String] directory that receives README.txt
  # @param doc [Nokogiri::XML::Document] parsed slideshow XML
  def setreadme(dir, doc)
    contents = [
      "主题: #{doc.css('fashion-show').text}",
      "时间:#{doc.css('fashion-show').attr('show_date')}",
      "城市:#{doc.css('slideshow').attr('city')}",
      "设计师:#{doc.css('designer').text}",
      "内容:#{doc.css('body').text}"
    ]
    File.open("#{dir}/README.txt", 'w') do |readme|
      contents.each { |content| readme.write(content + "\n") }
    end
  end

  # Downloads every FULL_SCREEN_IMAGE photo referenced by a saved API page.
  #
  # Images already present on disk are skipped. Each newly downloaded image
  # URL is appended to README.txt, followed by the count of new downloads.
  #
  # @param a_uri [String] path of the saved slideshow XML
  # @param a [String] season code, used as the first directory level
  # @param b [String] design code, used as the second directory level
  def download_images_from_api_page(a_uri, a, b)
    dir = "images/#{a}/#{b}"
    FileUtils.mkdir_p(dir)
    doc = File.open(a_uri) { |f| Nokogiri::XML(f) }
    setreadme(dir, doc)

    imagecount = 0
    # Block form guarantees README.txt is closed even if a download raises.
    File.open("#{dir}/README.txt", 'a') do |readme|
      doc.css('photo[type="FULL_SCREEN_IMAGE"]').each do |link|
        uri = link.attr('uri')
        next unless uri # a photo node without a uri cannot be downloaded

        image_url = full_path(uri)
        puts image_url
        deru = image_url =~ /DETAILS/ ? 'DETAILS_' : 'RUNWAY_'
        image_uri = dir + '/' + deru + File.basename(image_url)
        puts image_uri

        if File.exist?(image_uri)
          puts "We got it!"
        else
          puts "Downloading..."
          download_image(image_url, image_uri)
          imagecount += 1
          readme.write(image_url + "\n")
        end
      end
      readme.write("图片总数: #{imagecount}")
    end
  end

  private

  # Fetches one page's slideshow XML and then its images.
  #
  # Entries that do not match PAGE_PATTERN are ignored. Fetch failures
  # (timeouts, HTTP errors, missing files) are logged and the page is
  # skipped so that the remaining pages are still processed.
  def process_page(p_url)
    a_url = ''
    match = PAGE_PATTERN.match(p_url)
    return unless match

    a = match[1] # season code
    b = match[2] # design code
    a_url = @api.sub("<SEASON>", a).sub("<DESIGNCODE>", b)
    api_dir = "pages/api/#{a}"
    # Created per page so entries from a different season also have a target.
    FileUtils.mkdir_p(api_dir)
    a_uri = "#{api_dir}/" + make_uri(a_url)
    puts a_url

    download_xml(a_url, a_uri)
    download_images_from_api_page(a_uri, a, b)
  rescue Timeout::Error, Errno::ENOENT, OpenURI::HTTPError
    log_failure(a_url)
  end

  # Appends a timestamped failure line for +a_url+ to exception.txt.
  def log_failure(a_url)
    File.open("exception.txt", "a") do |file|
      file.write(Time.now.to_s + "no such file:" + a_url + "\n")
    end
  end

  # Returns the body of +url+ as a String.
  #
  # Uses URI.open because Kernel#open no longer handles URLs on Ruby 3.0+.
  def fetch(url)
    URI.open(url) { |io| io.read }
  end
end

get_images = GetImages.new
get_images.start