# ruby_arXiv_parsing_example.rb
#
# This sample script illustrates a basic arXiv api call
# followed by parsing of the results using the ruby
# feedtools module.
#
# Please see the documentation at 
# http://export.arxiv.org/api_help/docs/user-manual.html
# for more information, or email the arXiv api 
# mailing list at arxiv-api@googlegroups.com.
#
# net/http and uri are part of the default ruby distribution.
# RubyGems can be obtained from http://www.rubygems.org/ and
# feedtools can be installed via
#  gem install feedtools
#
# Author: Julius B. Lucks
#
# This is free software.  Feel free to do what you want
# with it, but please play nice with the arXiv API!

require 'rubygems'
require 'feed_tools'
require 'net/http'
require 'uri'

# FeedTools will automatically sort the entries by date.
# Set :entry_sorting_property to nil to leave the
# default entry order untouched instead.
FeedTools.configurations[:entry_sorting_property] = nil

# Base api query url
base_url = 'http://export.arxiv.org/api/query?'

# Search parameters
search_query = 'all:electron' # search for electron in all fields
start = 0                     # retrieve the first 5 results
max_results = 5

# Construct the query string from the search parameters.
# URI.encode_www_form percent-encodes every value (and turns spaces
# into '+'), so search terms containing spaces, quotes, '&' or other
# reserved characters still produce a valid URL. The parameters are
# passed as an array of pairs to keep their order fixed.
query = URI.encode_www_form([
  ['search_query', search_query],
  ['start', start],
  ['max_results', max_results]
])

# Fetch the query url and parse the returned feed in a single step
feed = FeedTools::Feed.open(base_url + query)

puts format('Feed title: %s', feed.title)
puts format('Feed last updated: %s', feed.updated)

# opensearch metadata such as totalResults, startIndex,
# and itemsPerPage live in the opensearch namespace.
# Each pair is [node to look up, line to print for it].
[
  ['opensearch:totalResults', 'totalResults for this query: %s'],
  ['opensearch:startIndex', 'startIndex for this query: %s'],
  ['opensearch:itemsPerPage', 'itemsPerPage for this query: %s']
].each do |node_name, template|
  # the value printed is the first child of the located node
  puts format(template, feed.find_node(node_name).to_a.first)
end

puts

# Run through each entry of the feed and print its metadata:
# id, dates, title, authors (with affiliations), links, journal
# reference, comments, categories and abstract.
feed.items.each do |item|
  puts 'e-print metadata'

  # the arXiv id is the part of the entry id that follows '/abs/'
  puts 'arxiv-id: %s' % item.id.split('/abs/').last
  puts 'Published: %s' % item.published
  puts 'Title: %s' % item.title

  # FeedTools v0.2.26 only allows one author so
  # gather a list of authors and affiliations by hand.
  #  This is a little complicated due to the fact that the author
  #  affiliations are in the arxiv namespace (if present)
  puts "Authors: \n"
  item.find_all_nodes('author').each do |author|
    # the name is stored in the first subelement, the affiliations
    # (if any) in the remaining ones
    name_node, *affil_nodes = author.elements.to_a

    # an <author> without subelements prints an empty name instead
    # of aborting the whole script
    name = name_node ? name_node.text : nil
    affils = affil_nodes.map { |node| node.text }

    author_string = "    #{name}"
    author_string += " (#{affils.join(', ')})" unless affils.empty?

    puts author_string
  end

  # get the links to the abs page and pdf for this e-print
  item.links.each do |link|
    if link.rel == 'alternate'
      puts 'abs page link: %s' % link.href
    elsif link.title == 'pdf'
      puts 'pdf link: %s' % link.href
    end
  end

  # The journal reference, comments and primary_category sections live under
  # the arxiv namespace. Each of them may be missing, so every lookup
  # falls back to a placeholder text.
  journal_ref = item.find_all_nodes('journal_ref')[0].to_a.first || 'No journal ref found'
  puts 'Journal Reference: %s' % journal_ref

  comment = item.find_all_nodes('comment')[0].to_a.first || 'No comment found'
  puts 'Comments: %s' % comment

  # guard against a missing node before reading its 'term' attribute
  primary_node = item.find_all_nodes('primary_category')[0]
  primary_category = primary_node ? primary_node.attributes['term'] : nil
  puts 'Primary Category: %s' % (primary_category || 'No primary category found')

  # Lets get all the categories
  all_categories = item.categories.map { |category| category.term }
  puts 'All Categories: %s' % all_categories.join(', ')

  # The abstract is in the <summary> element
  puts 'Abstract: %s' % item.summary

end