存档

文章标签 ‘ruby’

ruby图片抓取

2011年6月3日 没有评论 1,975 views  

谨以此117行代码,来怀念一下我曾经的ruby生活。

这是一个用来抓取图片的脚本

其中F2010MEN.txt和S2011MEN.txt的内容格式为

"/fashionshows/review/S2011MEN-ACNE","/fashionshows/review/S2011MEN-AKIMMEL"……

require 'rubygems'
require 'pp'
require 'nokogiri'
require 'open-uri'
require 'fileutils'
require 'timeout'

# Downloads slideshow XML documents from the style.com data-streamer API and
# then every FULL_SCREEN_IMAGE photo listed in them.
#
# Input:  menswear/<name> for each entry of @file_names; each file is a
#         comma-separated list of review paths containing "<SEASON>-<DESIGNCODE>".
# Output: pages/api/<SEASON>/<escaped api url>   raw XML per show
#         images/<SEASON>/<DESIGNCODE>/          images plus a README.txt
#         exception.txt                          log of shows that were skipped
class GetImages
  # Captures the season (group 1) and the design code (group 2) of a review path.
  PAGE_PATTERN = /([\d\w]+)\-(\w+)/

  def initialize
    @file_names = ['F2010MEN.txt','S2011MEN.txt']
    @api = "http://www.style.com/api/data-streamer?id=empty&out=xml&dsType=slideshow&season=<SEASON>&designCode=<DESIGNCODE>"
    @home = "http://www.style.com"
  end

  # Processes every list file in @file_names, one show page at a time.
  def start
    @file_names.each do |name|
      pages = getpages(name)
      # An empty list used to crash on pages[0, -1] (which is nil).
      next if pages.empty?

      # The season (show name) is taken from the first page of the list.
      season = pages.first[PAGE_PATTERN, 1]
      FileUtils.mkdir_p("pages/api/#{season}")

      # NOTE(review): as in the original, the slice stops one entry short, so
      # the LAST page of every list is never processed. This looks like an
      # off-by-one but may be deliberate — confirm before changing.
      pages[0, pages.size - 1].each { |p_url| process_page(p_url) }
    end
  end

  # Reads the comma-separated page list stored in menswear/<name>.
  #
  # @param name [String] file name inside the menswear directory
  # @return [Array<String>] the raw entries, exactly as split on ','
  def getpages(name)
    # File.join keeps the path portable; a literal backslash only works on Windows.
    File.read(File.join("menswear", name)).split(',')
  end

  # Returns url unchanged when it starts with "http", otherwise prefixes @home.
  def full_path(url)
    # \A anchors to the start of the string; ^ would match after any newline.
    if url =~ /\Ahttp/
      url
    else
      @home + url
    end
  end

  # Turns a URL into a flat file name by replacing every non-word character with '_'.
  def make_uri(url)
    url.gsub(/\W/, '_')
  end

  # Downloads a_url and stores it in binary mode at a_uri.
  def download_image(a_url, a_uri)
    # Fetch first: a failed download must not leave an empty file behind,
    # because an existing file is later treated as "already downloaded".
    data = fetch(a_url)
    File.open(a_uri, "wb") { |file| file.write(data) }
  end

  # Downloads a_url and stores it in text mode at a_uri.
  def download_xml(a_url, a_uri)
    data = fetch(a_url)
    File.open(a_uri, "w") { |file| file.write(data) }
  end

  # Writes <dir>/README.txt with the show metadata read from the parsed XML.
  #
  # @param dir [String] directory that receives README.txt (must exist)
  # @param doc [Nokogiri::XML::Document] parsed slideshow document
  def setreadme(dir,doc)
    contents = ["主题: #{doc.css('fashion-show').text}","时间:#{doc.css('fashion-show').attr('show_date')}","城市:#{doc.css('slideshow').attr('city')}","设计师:#{doc.css('designer').text}","内容:#{doc.css('body').text}"]
    File.open("#{dir}/README.txt","w") do |readme|
      contents.each { |content| readme.write(content+"\n") }
    end
  end

  # Parses the XML saved at a_uri and downloads each FULL_SCREEN_IMAGE photo
  # into images/<a>/<b>, skipping files that already exist. Every newly
  # downloaded URL and the final count are appended to that directory's README.txt.
  #
  # @param a_uri [String] path of the saved slideshow XML
  # @param a [String] season
  # @param b [String] design code
  def download_images_from_api_page(a_uri, a, b)
    dir = "images/#{a}/#{b}"
    FileUtils.mkdir_p(dir)
    doc = File.open(a_uri) { |f| Nokogiri::XML(f) }
    setreadme(dir,doc)

    imagecount = 0
    File.open("#{dir}/README.txt","a") do |readme|
      doc.css('photo[type="FULL_SCREEN_IMAGE"]').each do |link|
        image_url = full_path(link.attr('uri'))
        puts image_url
        # Detail shots and runway shots get different file name prefixes.
        deru = image_url =~ /DETAILS/ ? "DETAILS_" : "RUNWAY_"
        image_uri = dir + '/' + deru + File.basename(image_url)
        puts image_uri
        # File.exist? replaces File.exists?, which was removed in Ruby 3.2.
        if File.exist?(image_uri)
          puts "We got it!"
        else
          puts "Downloading..."
          download_image(image_url, image_uri)
          imagecount += 1
          readme.write(image_url+"\n")
        end
      end
      readme.write("图片总数: #{imagecount}")
    end
  end

  private

  # Downloads the XML and the images of one show page. Entries that do not
  # contain "<SEASON>-<DESIGNCODE>" are ignored. Timeouts and missing files are
  # logged and the page is skipped (the original reached the same outcome by
  # blanking the page and calling retry).
  def process_page(p_url)
    match = PAGE_PATTERN.match(p_url)
    return unless match

    a, b = match[1], match[2]
    a_url = @api.sub("<SEASON>", a).sub("<DESIGNCODE>", b)
    a_uri = "pages/api/#{a}/" + make_uri(a_url)
    puts a_url

    begin
      # The season directory made in #start only covers the first page's season.
      FileUtils.mkdir_p(File.dirname(a_uri))
      download_xml(a_url, a_uri)                      # 1
      download_images_from_api_page(a_uri, a, b)      # 2
    rescue Timeout::Error, Errno::ENOENT
      log_failure(a_url)
    end
  end

  # Appends one timestamped line naming the failed URL to exception.txt.
  def log_failure(a_url)
    File.open("exception.txt","a") do |log|
      log.write("#{Time.now}no such file:#{a_url}\n")
    end
  end

  # Returns the body found at url.
  # URI#read comes from open-uri. Unlike Kernel#open on a string (removed for
  # URLs in Ruby 3.0) it can never open a local path or spawn a "|command".
  def fetch(url)
    URI.parse(url).read
  end
end

# Script entry point: build the scraper and run it over every configured list file.
GetImages.new.start
分类: ruby 标签: