The Mechanize library is used for automating interaction with a website. It can follow links and submit forms. Form fields can be populated and submitted. A history of URLs is maintained and can be queried.
# Synopsis: search Google for "Hello" and print the resulting page body.
require 'rubygems'
require 'mechanize'
require 'logger'

# Create an agent that logs its activity to mech.log.
agent = WWW::Mechanize.new { |a| a.log = Logger.new("mech.log") }
agent.user_agent_alias = 'Mac Safari'

# Fetch the page, fill in the form named "f" and submit it.
page = agent.get("http://www.google.com/")
search_form = page.form_with(:name => "f")
search_form.field_with(:name => "q").value = "Hello"
search_results = agent.submit(search_form)

puts search_results.body
VERSION | = | '0.8.5' | The version of Mechanize you are using. | |
AGENT_ALIASES | = | { 'Windows IE 6' => 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)', 'Windows IE 7' => 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)', 'Windows Mozilla' => 'Mozilla/5.0 (Windows; U; Windows NT 5.0; en-US; rv:1.4b) Gecko/20030516 Mozilla Firebird/0.6', 'Mac Safari' => 'Mozilla/5.0 (Macintosh; U; PPC Mac OS X; en) AppleWebKit/418 (KHTML, like Gecko) Safari/417.9.3', 'Mac FireFox' => 'Mozilla/5.0 (Macintosh; U; PPC Mac OS X Mach-O; en-US; rv:1.8.0.3) Gecko/20060426 Firefox/1.5.0.3', 'Mac Mozilla' => 'Mozilla/5.0 (Macintosh; U; PPC Mac OS X Mach-O; en-US; rv:1.4a) Gecko/20030401', 'Linux Mozilla' => 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.4) Gecko/20030624', 'Linux Konqueror' => 'Mozilla/5.0 (compatible; Konqueror/3; Linux)', 'iPhone' => 'Mozilla/5.0 (iPhone; U; CPU like Mac OS X; en) AppleWebKit/420+ (KHTML, like Gecko) Version/3.0 Mobile/1C28 Safari/419.3', 'Mechanize' => "WWW-Mechanize/#{VERSION} (http://rubyforge.org/projects/mechanize/)" } | User Agent aliases |
redirect_ok | -> | follow_redirect? |
ca_file | [RW] | |
cert | [RW] | |
conditional_requests | [RW] | |
cookie_jar | [RW] | |
follow_meta_refresh | [RW] | |
history | [R] | |
history_added | [RW] | |
html_parser | [RW] | |
keep_alive | [RW] | |
keep_alive_time | [RW] | |
key | [RW] | |
log | [RW] | |
open_timeout | [RW] | |
pass | [RW] | |
pluggable_parser | [R] | |
read_timeout | [RW] | |
redirect_ok | [RW] | |
redirection_limit | [RW] | |
scheme_handlers | [RW] | |
user_agent | [RW] | |
verify_callback | [RW] | |
watch_for_set | [RW] |
# File lib/www/mechanize.rb, line 93
# Builds an agent with default settings. When a block is given, the new
# agent is yielded to it after all defaults have been set.
def initialize
  # State exposed through attr_accessors
  @cookie_jar      = CookieJar.new
  @log             = nil
  @open_timeout    = nil
  @read_timeout    = nil
  @user_agent      = AGENT_ALIASES['Mechanize']
  @watch_for_set   = nil
  @history_added   = nil
  @ca_file         = nil   # OpenSSL server certificate file

  # Callback for OpenSSL errors while verifying the server certificate
  # chain; can be used for debugging or to ignore errors by always
  # returning _true_
  @verify_callback = nil
  @cert            = nil   # OpenSSL Certificate
  @key             = nil   # OpenSSL Private Key
  @pass            = nil   # OpenSSL Password
  @redirect_ok     = true  # Should we follow redirects?

  # State exposed through attr_readers
  @history          = WWW::Mechanize::History.new
  @pluggable_parser = PluggableParser.new

  # Authentication
  @user      = nil   # Auth User
  @password  = nil   # Auth Password
  @digest    = nil   # DigestAuth Digest
  @auth_hash = {}    # Keep track of urls for sending auth

  # Proxy settings
  @proxy_addr = nil
  @proxy_pass = nil
  @proxy_port = nil
  @proxy_user = nil

  @conditional_requests = true

  @follow_meta_refresh = false
  @redirection_limit   = 20

  # Connection cache & keep-alive
  @connection_cache = {}
  @keep_alive_time  = 300
  @keep_alive       = true

  # Looking up an unregistered scheme installs (and returns) a handler
  # that raises UnsupportedSchemeError when it is invoked.
  @scheme_handlers = Hash.new do |handlers, scheme|
    handlers[scheme] = lambda do |link, page|
      raise UnsupportedSchemeError.new(scheme)
    end
  end

  # The supported schemes all share one handler that returns the link
  # unchanged.
  passthrough = lambda { |link, page| link }
  %w[http https relative file].each do |scheme|
    @scheme_handlers[scheme] = passthrough
  end

  @pre_connect_hook  = Chain::PreConnectHook.new
  @post_connect_hook = Chain::PostConnectHook.new

  yield self if block_given?
end
Sets the user and password to be used for authentication.
# File lib/www/mechanize.rb, line 186
# Stores the user name and password to be used for authentication.
# Returns +password+ (the value of the last assignment).
def auth(user, password)
  @user     = user
  @password = password
end
Clicks the WWW::Mechanize::Link object passed in and returns the page fetched.
# File lib/www/mechanize.rb, line 279
# Follows +link+ and returns the page fetched.
#
# The target is +link.href+ when the object responds to +href+; otherwise
# it is looked up as <tt>link['href']</tt>, falling back to
# <tt>link['src']</tt>. The referer is +link.page+ when that call succeeds
# and returns a value, else the current page.
def click(link)
  # Any StandardError raised by link.page (e.g. the object has no such
  # method) is treated as "no referer known".
  referer = begin
    link.page
  rescue
    nil
  end

  href = if link.respond_to?(:href)
    link.href
  else
    link['href'] || link['src']
  end

  get(:url => href, :referer => (referer || current_page()))
end
DELETE to url with query_params, and setting options:
delete('http://tenderlovemaking.com/', {'q' => 'foo'}, :headers => {})
# File lib/www/mechanize.rb, line 259
# Issues a DELETE to +url+ by delegating to #put with <tt>:verb => :delete</tt>
# merged into +options+ (so an explicit :verb in +options+ is overridden).
#
# A block, if given, is forwarded to #put, which yields the fetched page
# to it. Returns whatever #put returns.
def delete(url, query_params = {}, options = {}, &block)
  put(url, query_params, options.merge({:verb => :delete}), &block)
end
Fetches the URL passed in and returns a page.
# File lib/www/mechanize.rb, line 193
# Fetches a URL and returns the resulting page, adding it to the history.
# The page is also yielded when a block is given.
#
# +options+ is either the URL itself or a Hash with the keys :url
# (required), :params, :referer and :headers. In the non-Hash form,
# +parameters+ and +referer+ are taken from the positional arguments.
#
# Raises ArgumentError when the Hash form has no :url.
def get(options, parameters = [], referer = nil)
  if options.is_a? Hash
    raise ArgumentError.new("url must be specified") unless url = options[:url]
    parameters = options[:params] || []
    referer    = options[:referer]
    headers    = options[:headers]
  else
    url = options
    # FIXME: Remove this in 0.8.0
    # Legacy call style get(url, referer): a second argument that cannot
    # be iterated is taken to be the referer.
    unless parameters.respond_to?(:each)
      referer    = parameters
      parameters = []
    end
  end

  # Without an explicit referer: URLs starting with "http" get a blank
  # page; anything else uses the current page, or a blank page when there
  # is none.
  referer ||= (url =~ /^http/ ? nil : current_page) ||
              Page.new(nil, {'content-type'=>'text/html'})

  # FIXME: Huge hack so that using a URI as a referer works.  I need to
  # refactor everything to pass around URIs but still support
  # WWW::Mechanize::Page#base
  unless referer.is_a?(WWW::Mechanize::File)
    referer_uri = referer.is_a?(String) ? URI.parse(referer) : referer
    referer = Page.new(referer_uri, {'content-type' => 'text/html'})
  end

  # fetch the page
  page = fetch_page(
    :uri     => url,
    :referer => referer,
    :headers => headers || {},
    :params  => parameters
  )
  add_to_history(page)
  yield page if block_given?
  page
end
Fetch a file and return the contents of the file.
# File lib/www/mechanize.rb, line 273
# Fetches +url+ with #get and returns the +body+ of the result.
def get_file(url)
  fetched = get(url)
  fetched.body
end
HEAD to url with query_params, and setting options:
head('http://tenderlovemaking.com/', {'q' => 'foo'}, :headers => {})
# File lib/www/mechanize.rb, line 268
# Issues a HEAD to +url+ by delegating to #put with <tt>:verb => :head</tt>
# merged into +options+ (so an explicit :verb in +options+ is overridden).
#
# A block, if given, is forwarded to #put, which yields the fetched page
# to it. Returns whatever #put returns.
def head(url, query_params = {}, options = {}, &block)
  put(url, query_params, options.merge({:verb => :head}), &block)
end
Posts to the given URL with the query parameters passed in. Query parameters can be passed as a hash, or as an array of arrays. Example:
agent.post('http://example.com/', "foo" => "bar")
or
agent.post('http://example.com/', [ ["foo", "bar"] ])
# File lib/www/mechanize.rb, line 298
# Posts +query+ to +url+ by building a throwaway form and handing it to
# #post_form.
#
# +query+ is iterated as key/value pairs. A value that is an IO becomes a
# file upload (and switches the form to multipart/form-data); every other
# value becomes a regular field.
def post(url, query={})
  # Create a fake form: a plain Hash standing in for the form node, with
  # a +search+ method that always returns an empty list.
  node = {
    'method'  => 'POST',
    'enctype' => 'application/x-www-form-urlencoded'
  }
  def node.search(*args); []; end

  form = Form.new(node)
  query.each do |k, v|
    unless v.is_a?(IO)
      form.fields << Form::Field.new(k.to_s,v)
      next
    end

    form.enctype = 'multipart/form-data'
    upload = Form::FileUpload.new(k.to_s,::File.basename(v.path))
    upload.file_data = v.read
    form.file_uploads << upload
  end
  post_form(url, form)
end
# File lib/www/mechanize.rb, line 164
# Returns the +hooks+ collection of the post-connect hook.
def post_connect_hooks
  hook = @post_connect_hook
  hook.hooks
end
# File lib/www/mechanize.rb, line 160
# Returns the +hooks+ collection of the pre-connect hook.
def pre_connect_hooks
  hook = @pre_connect_hook
  hook.hooks
end
PUT to url with query_params, and setting options:
put('http://tenderlovemaking.com/', {'q' => 'foo'}, :headers => {})
# File lib/www/mechanize.rb, line 240
# Issues a PUT to +url+ with +query_params+, adds the resulting page to
# the history and returns it. The page is also yielded when a block is
# given.
#
# Entries in +options+ override the defaults (:uri, :headers, :params,
# :verb) passed on to fetch_page.
def put(url, query_params = {}, options = {})
  defaults = {
    :uri     => url,
    :headers => {},
    :params  => query_params,
    :verb    => :put
  }

  # fetch the page
  page = fetch_page(defaults.merge(options))
  add_to_history(page)
  yield page if block_given?
  page
end
Sets the proxy address, port, user, and password. addr should be a host, with no "http://".
# File lib/www/mechanize.rb, line 170
# Stores the proxy address, port, user and password.
# Returns the four values as an Array, in that order.
def set_proxy(addr, port, user = nil, pass = nil)
  @proxy_addr = addr
  @proxy_port = port
  @proxy_user = user
  @proxy_pass = pass
  [addr, port, user, pass]
end
Submit a form with an optional button. Without a button:
page = agent.get('http://example.com') agent.submit(page.forms.first)
With a button
agent.submit(page.forms.first, page.forms.first.buttons.first)
# File lib/www/mechanize.rb, line 327
# Submits +form+, first adding +button+ to its query when one is given.
#
# A POST form goes through #post_form. A GET form goes through #get with
# any trailing query string stripped from the action, the form's built
# query as parameters and the form's page as referer. Any other method
# raises a RuntimeError.
def submit(form, button=nil)
  form.add_button_to_query(button) if button

  verb = form.method.upcase
  if verb == 'POST'
    post_form(form.action, form)
  elsif verb == 'GET'
    action_without_query = form.action.gsub(/\?[^\?]*$/, '')
    get(
      :url     => action_without_query,
      :params  => form.build_query,
      :referer => form.page
    )
  else
    raise "unsupported method: #{form.method.upcase}"
  end
end
Runs given block, then resets the page history as it was before. self is given as a parameter to the block. Returns the value of the block.
# File lib/www/mechanize.rb, line 362
# Yields the agent to the block, then puts back a copy of the history
# taken before the block ran, even if the block raises. Returns the
# block's value.
def transact
  snapshot = @history.dup
  begin
    yield(self)
  ensure
    @history = snapshot
  end
end
Returns whether or not a url has been visited
# File lib/www/mechanize.rb, line 348
# Returns true when visited_page finds an entry for +url+ (i.e. its result
# is not nil), false otherwise.
def visited?(url)
  visited_page(url).nil? ? false : true
end
# File lib/www/mechanize.rb, line 568
# Pushes +page+ onto the history together with the resolved form of its
# URI, then invokes the +history_added+ callback with the page when one
# is set.
def add_to_history(page)
  @history.push(page, resolve(page.uri))
  callback = history_added
  callback.call(page) if callback
end
uri is an absolute URI
# File lib/www/mechanize.rb, line 405
# Performs one request/response cycle and returns the resulting page.
#
# +params+ is merged over the default options (:referer, :uri, :verb,
# :params, :headers, :redirects, ...). The steps are:
#
# 1. Run the pre-connect chain, which fills in the resolved :uri,
#    :request and :connection entries of the options hash.
# 2. Send the request, retrying up to two times on EOFError,
#    Errno::ECONNRESET or Errno::EPIPE.
# 3. Run the post-connect chain, which fills in :res_klass and :page.
# 4. Act on the result: follow meta/Refresh redirects (only when
#    follow_meta_refresh is set), return successful pages, reuse the
#    history entry on 304, follow HTTP redirects, or retry once with
#    credentials on 401.
#
# Raises RedirectLimitReachedError when following another redirect would
# exceed redirection_limit, StandardError for an unparsable Refresh
# header, and ResponseCodeError for any response that is not handled.
def fetch_page(params)
  options = {
    :request    => nil,
    :response   => nil,
    :connection => nil,
    :referer    => current_page(),
    :uri        => nil,
    :verb       => :get,
    :agent      => self,
    :redirects  => 0,
    :params     => [],
    :headers    => {},
  }.merge(params)

  before_connect = Chain.new([
    Chain::URIResolver.new(@scheme_handlers),
    Chain::ParameterResolver.new,
    Chain::RequestResolver.new,
    Chain::ConnectionResolver.new(
      @connection_cache,
      @keep_alive,
      @proxy_addr,
      @proxy_port,
      @proxy_user,
      @proxy_pass
    ),
    Chain::SSLResolver.new(@ca_file, @verify_callback, @cert, @key, @pass),
    Chain::AuthHeaders.new(@auth_hash, @user, @password, @digest),
    Chain::HeaderResolver.new(
      @keep_alive,
      @keep_alive_time,
      @cookie_jar,
      @user_agent
    ),
    Chain::CustomHeaders.new,
    @pre_connect_hook,
  ])
  before_connect.handle(options)

  uri          = options[:uri]
  request      = options[:request]
  cur_page     = options[:referer]
  request_data = options[:params]
  redirects    = options[:redirects]
  http_obj     = options[:connection]

  # Add If-Modified-Since if page is in history
  if( (page = visited_page(uri)) && page.response['Last-Modified'] )
    request['If-Modified-Since'] = page.response['Last-Modified']
  end if(@conditional_requests)

  # Specify timeouts if given
  http_obj.open_timeout = @open_timeout if @open_timeout
  http_obj.read_timeout = @read_timeout if @read_timeout
  http_obj.start unless http_obj.started?

  # Log specified headers for the request
  log.info("#{ request.class }: #{ request.path }") if log
  request.each_header do |k, v|
    log.debug("request-header: #{ k } => #{ v }")
  end if log

  # Send the request
  attempts = 0
  begin
    response = http_obj.request(request, *request_data) { |r|
      connection_chain = Chain.new([
        Chain::ResponseReader.new(r),
        Chain::BodyDecodingHandler.new,
      ])
      connection_chain.handle(options)
    }
  rescue EOFError, Errno::ECONNRESET, Errno::EPIPE => x
    log.error("Rescuing EOF error") if log
    http_obj.finish
    raise x if attempts >= 2
    # Clear the body before re-sending; it is supplied again through
    # request_data on the retry.
    request.body = nil
    http_obj.start
    attempts += 1
    retry
  end

  after_connect = Chain.new([
    @post_connect_hook,
    Chain::ResponseBodyParser.new(@pluggable_parser, @watch_for_set),
    Chain::ResponseHeaderHandler.new(@cookie_jar, @connection_cache),
  ])
  after_connect.handle(options)

  res_klass = options[:res_klass]
  page      = options[:page]

  log.info("status: #{ page.code }") if log

  if follow_meta_refresh
    redirect_uri = nil
    if (page.respond_to?(:meta) && (redirect = page.meta.first))
      # Enforce the limit here as well; otherwise a page that refreshes
      # to itself through a meta tag recurses without bound.
      if redirects + 1 > redirection_limit
        raise RedirectLimitReachedError.new(page, redirects)
      end
      redirect_uri = redirect.uri.to_s
    elsif refresh = response['refresh']
      parsed_refresh = refresh.match(/^\s*(\d+\.?\d*);\s*(url|URL)=(\S*)\s*$/)
      raise StandardError, "Invalid refresh http header" unless parsed_refresh
      delay = parsed_refresh[1]
      location = parsed_refresh[3]
      # A location without an http(s) scheme is resolved against the
      # request's own scheme, host and (non-default) port.
      unless location =~ /\Ahttps?:\/\//i
        origin = "#{uri.scheme}://#{uri.host}"
        origin << ":#{uri.port}" unless uri.port == uri.default_port
        location = "#{origin}#{location}"
      end
      if redirects + 1 > redirection_limit
        raise RedirectLimitReachedError.new(page, redirects)
      end
      sleep delay.to_i
      redirect_uri = location
    end
    if redirect_uri
      @history.push(page, page.uri)
      return fetch_page(
        :uri       => redirect_uri,
        :referer   => page,
        :params    => [],
        :verb      => :get,
        :redirects => redirects + 1
      )
    end
  end

  return page if res_klass <= Net::HTTPSuccess

  if res_klass == Net::HTTPNotModified
    log.debug("Got cached page") if log
    return visited_page(uri) || page
  elsif res_klass <= Net::HTTPRedirection
    return page unless follow_redirect?
    log.info("follow redirect to: #{ response['Location'] }") if log
    from_uri = page.uri
    raise RedirectLimitReachedError.new(page, redirects) if redirects + 1 > redirection_limit
    # A redirected HEAD stays a HEAD; every other verb becomes a GET.
    redirect_verb = options[:verb] == :head ? :head : :get
    page = fetch_page(
      :uri       => response['Location'].to_s,
      :referer   => page,
      :params    => [],
      :verb      => redirect_verb,
      :redirects => redirects + 1
    )
    @history.push(page, from_uri)
    return page
  elsif res_klass <= Net::HTTPUnauthorized
    raise ResponseCodeError.new(page) unless @user || @password
    # The host is already in the auth hash, so authentication was
    # attempted before: give up instead of looping.
    raise ResponseCodeError.new(page) if @auth_hash.has_key?(uri.host)
    if response['www-authenticate'] =~ /Digest/i
      @auth_hash[uri.host] = :digest
      if response['server'] =~ /Microsoft-IIS/
        @auth_hash[uri.host] = :iis_digest
      end
      @digest = response['www-authenticate']
    else
      @auth_hash[uri.host] = :basic
    end
    return fetch_page(
      :uri     => uri,
      :referer => cur_page,
      :verb    => request.method.downcase.to_sym,
      :params  => request_data,
      :headers => options[:headers]
    )
  end

  raise ResponseCodeError.new(page), "Unhandled response", caller
end
# File lib/www/mechanize.rb, line 383
# POSTs +form+ to +url+, adds the resulting page to the history and
# returns it.
#
# The referer is the form's page, else the current page, else a blank
# page. The request body is +form.request_data+; Content-Type is the
# form's enctype and Content-Length is the body's length in bytes.
def post_form(url, form)
  cur_page = form.page || current_page ||
             Page.new( nil, {'content-type'=>'text/html'})

  request_data = form.request_data

  log.debug("query: #{ request_data.inspect }") if log

  # Content-Length counts octets. String#size counts characters on
  # Ruby >= 1.9, which under-reports multibyte bodies, so prefer bytesize
  # where it exists.
  content_length = if request_data.respond_to?(:bytesize)
    request_data.bytesize
  else
    request_data.size
  end

  # fetch the page
  page = fetch_page(
    :uri     => url,
    :referer => cur_page,
    :verb    => :post,
    :params  => [request_data],
    :headers => {
      'Content-Type'   => form.enctype,
      'Content-Length' => content_length.to_s,
    }
  )
  add_to_history(page)
  page
end