; Probability with which sample_keep_headers keeps each non-header line
; (0.01 keeps roughly 1% of the rows).
; NOTE(review): ":=" is presumably Drake's "assign only if not already set"
; form, which would let the value be overridden from outside - confirm.
SAMPLE_SIZE:=0.01

; Method: write the concatenated contents of every member of the input
; archive to the output file (unzip -p extracts to stdout).
unzip()
  unzip -p "$INPUT" > "$OUTPUT"

; Method: start the output with headers.csv, then append the archive contents
; after filtering each line through perl.
; The perl filter deletes every occurrence of "na" (case-insensitive) and
; every space character, anywhere in the line.
; NOTE(review): the filter is not anchored to whole fields, so it would also
; alter any value that merely contains "na" - confirm this is acceptable.
; NOTE(review): headers.csv is a relative path; assumes the step runs from the
; directory that contains it.
add_headers_and_unzip()
  cp headers.csv "$OUTPUT"
  unzip -p "$INPUT" | perl -p -e 's/na| //gi' >> "$OUTPUT"

; Method: same as add_headers_and_unzip, but extracts with 7z instead of unzip
; ("7z x ... -so" writes the extracted data to stdout).
; The perl filter deletes every occurrence of "na" (case-insensitive) and
; every space character, anywhere in the line.
; NOTE(review): headers.csv is a relative path; assumes the step runs from the
; directory that contains it.
add_headers_and_7unzip()
  cp headers.csv "$OUTPUT"
  7z x "$INPUT" -so | perl -p -e 's/na| //gi' >> "$OUTPUT"

sample_keep_headers() [python]
  # Copy the first line of the input unchanged, then keep each later line
  # independently with probability SAMPLE_SIZE.
  import random
  sample_size = $[SAMPLE_SIZE]
  # Nested "with" blocks (matching the other Python methods in this file)
  # guarantee both files are closed even if an error interrupts the copy.
  with open("$[INPUT]", 'r') as source:
    with open("$[OUTPUT]", 'w') as sink:
      # An empty input yields an empty output.
      header = next(source, None)
      if header is not None:
        sink.write(header)
      for line in source:
        if random.random() < sample_size:
          sink.write(line)

add_fips_column() [python]
  # Copy the input CSV to the output, appending a 'fips_code' column whose
  # value is the concatenation of the fields at 0-based positions 11 and 12.
  # NOTE(review): those positions are presumably the state and county codes;
  # confirm against headers.csv.
  import csv
  # Binary modes are what the Python 2 csv module expects.
  with open("$[INPUT]", 'rb') as source:
    with open("$[OUTPUT]", 'wb') as sink:
      reader = csv.reader(source)
      writer = csv.writer(sink)
      # The first row is the header; it only gains the new column name.
      header = next(reader)
      writer.writerow(header + ['fips_code'])
      for record in reader:
        writer.writerow(record + [''.join(record[11:13])])

convert_ts_to_csv() [python]
  # Convert a tab-delimited file to comma-separated CSV, writing a fixed
  # header row first. Input rows are passed through unchanged.
  import csv
  fields = (
    'activity_year', 'respondent_id', 'agency_code', 'federal_tax_id',
    'respondent_name', 'respondent_address', 'respondent_city',
    'respondent_state', 'respondent_zip_code',
    'parent_name', 'parent_address', 'parent_city', 'parent_state',
    'parent_zip_code',
    'respondent_name_panel', 'respondent_city_panel',
    'respondent_state_panel',
    'assets', 'other_lender_code', 'region_code', 'lar_count',
    'validity_error',
  )
  # Binary modes are what the Python 2 csv module expects.
  with open("$[INPUT]", 'rb') as source:
    with open("$[OUTPUT]", 'wb') as sink:
      writer = csv.writer(sink)
      writer.writerow(fields)
      writer.writerows(csv.reader(source, dialect='excel-tab'))

; Method: make the output a hard link to the input, replacing any existing
; output file (ln -f).
link()
  ln -f "$INPUT" "$OUTPUT"

; Download the national counties gazetteer archive from census.gov.
; --fail makes an HTTP error fail the step instead of saving the error page
; as the archive; --location follows redirects.
gaz_counties.zip <- [-timecheck]
  curl --fail --location http://www.census.gov/geo/maps-data/data/docs/gazetteer/Gaz_counties_national.zip > "$OUTPUT"

; Extract the gazetteer archive, re-encode it to UTF-8, strip trailing
; whitespace from every line and turn tabs into commas.
; NOTE(review): the source encoding is declared as ISO-8859-2; confirm that is
; what the Census file uses (ISO-8859-1 would decode some accented letters
; differently).
; NOTE(review): fields are not quoted, so a value containing a comma would
; shift columns - presumably none do; verify.
gaz_counties.csv <- gaz_counties.zip
  unzip -p "$INPUT" | iconv -f ISO-8859-2 -t UTF-8 | perl -lpe 's/\s*$//;s/\t/,/g' > "$OUTPUT"

; Download the national HMDA LAR archives for 2007-2012, one output per year
; (OUTPUT0 is 2012 ... OUTPUT5 is 2007, matching the output list order).
; set -e stops at the first failed download so an early failure is not masked
; by later successful ones; --fail turns HTTP errors into failures and
; --location follows redirects.
hmda_lar_2012.zip, hmda_lar_2011.zip, hmda_lar_2010.zip, hmda_lar_2009.zip, hmda_lar_2008.zip, hmda_lar_2007.zip <- [-timecheck]
  set -e
  curl --fail --location http://www.ffiec.gov/hmdarawdata/LAR/National/2012HMDALAR%20-%20National.zip > "$OUTPUT0"
  curl --fail --location http://www.ffiec.gov/hmdarawdata/LAR/National/2011HMDALAR%20-%20National.zip > "$OUTPUT1"
  curl --fail --location http://www.ffiec.gov/hmdarawdata/LAR/National/2010HMDALAR%20-%20National.zip > "$OUTPUT2"
  curl --fail --location http://www.ffiec.gov/hmdarawdata/LAR/National/2009HMDALAR%20-%20National.zip > "$OUTPUT3"
  curl --fail --location http://www.ffiec.gov/hmdarawdata/LAR/National/2008HMDALAR%20-%20National.zip > "$OUTPUT4"
  curl --fail --location http://www.ffiec.gov/hmdarawdata/LAR/National/2007HMDALAR%20-%20National.zip > "$OUTPUT5"

; Expand each yearly LAR archive into a CSV that starts with headers.csv.
; 2007 is the only year that uses the 7z-based method.
; NOTE(review): presumably the 2007 archive cannot be read by unzip - confirm.
hmda_lar_all_2012_no_fips.csv <- hmda_lar_2012.zip [method:add_headers_and_unzip]
hmda_lar_all_2011_no_fips.csv <- hmda_lar_2011.zip [method:add_headers_and_unzip]
hmda_lar_all_2010_no_fips.csv <- hmda_lar_2010.zip [method:add_headers_and_unzip]
hmda_lar_all_2009_no_fips.csv <- hmda_lar_2009.zip [method:add_headers_and_unzip]
hmda_lar_all_2008_no_fips.csv <- hmda_lar_2008.zip [method:add_headers_and_unzip]
hmda_lar_all_2007_no_fips.csv <- hmda_lar_2007.zip [method:add_headers_and_7unzip]

; Append the fips_code column to each full yearly LAR file.
hmda_lar_all_2012.csv <- hmda_lar_all_2012_no_fips.csv [method:add_fips_column]
hmda_lar_all_2011.csv <- hmda_lar_all_2011_no_fips.csv [method:add_fips_column]
hmda_lar_all_2010.csv <- hmda_lar_all_2010_no_fips.csv [method:add_fips_column]
hmda_lar_all_2009.csv <- hmda_lar_all_2009_no_fips.csv [method:add_fips_column]
hmda_lar_all_2008.csv <- hmda_lar_all_2008_no_fips.csv [method:add_fips_column]
hmda_lar_all_2007.csv <- hmda_lar_all_2007_no_fips.csv [method:add_fips_column]

; Draw a random sample of each full yearly LAR file, keeping the header line.
; The sampling is unseeded, so every rebuild produces a different sample.
hmda_lar_sample_2012.csv <- hmda_lar_all_2012.csv [method:sample_keep_headers]
hmda_lar_sample_2011.csv <- hmda_lar_all_2011.csv [method:sample_keep_headers]
hmda_lar_sample_2010.csv <- hmda_lar_all_2010.csv [method:sample_keep_headers]
hmda_lar_sample_2009.csv <- hmda_lar_all_2009.csv [method:sample_keep_headers]
hmda_lar_sample_2008.csv <- hmda_lar_all_2008.csv [method:sample_keep_headers]
hmda_lar_sample_2007.csv <- hmda_lar_all_2007.csv [method:sample_keep_headers]

; Expose each sampled file under the short hmda_lar_YEAR.csv name via a hard
; link to the sample file.
hmda_lar_2012.csv <- hmda_lar_sample_2012.csv [method:link]
hmda_lar_2011.csv <- hmda_lar_sample_2011.csv [method:link]
hmda_lar_2010.csv <- hmda_lar_sample_2010.csv [method:link]
hmda_lar_2009.csv <- hmda_lar_sample_2009.csv [method:link]
hmda_lar_2008.csv <- hmda_lar_sample_2008.csv [method:link]
hmda_lar_2007.csv <- hmda_lar_sample_2007.csv [method:link]

; Download the HMDA institution-record archives for 2007-2012, one output per
; year (OUTPUT0 is 2012 ... OUTPUT5 is 2007, matching the output list order).
; set -e stops at the first failed download so an early failure is not masked
; by later successful ones; --fail turns HTTP errors into failures and
; --location follows redirects.
hmda_ts_2012.zip, hmda_ts_2011.zip, hmda_ts_2010.zip, hmda_ts_2009.zip, hmda_ts_2008.zip, hmda_ts_2007.zip <- [-timecheck]
  set -e
  curl --fail --location http://www.ffiec.gov/hmdarawdata/OTHER/2012HMDAInstitutionRecords.zip > "$OUTPUT0"
  curl --fail --location http://www.ffiec.gov/hmdarawdata/OTHER/2011HMDAInstitutionRecords.zip > "$OUTPUT1"
  curl --fail --location http://www.ffiec.gov/hmdarawdata/OTHER/2010HMDAInstitutionRecords.zip > "$OUTPUT2"
  curl --fail --location http://www.ffiec.gov/hmdarawdata/OTHER/2009HMDAInstitutionRecords.zip > "$OUTPUT3"
  curl --fail --location http://www.ffiec.gov/hmdarawdata/OTHER/2008HMDAInstitutionRecords.zip > "$OUTPUT4"
  curl --fail --location http://www.ffiec.gov/hmdarawdata/OTHER/2007HMDAInstitutionRecords.zip > "$OUTPUT5"

; Extract each yearly institution-record archive to a text file.
hmda_ts_2012.txt <- hmda_ts_2012.zip [method:unzip]
hmda_ts_2011.txt <- hmda_ts_2011.zip [method:unzip]
hmda_ts_2010.txt <- hmda_ts_2010.zip [method:unzip]
hmda_ts_2009.txt <- hmda_ts_2009.zip [method:unzip]
hmda_ts_2008.txt <- hmda_ts_2008.zip [method:unzip]
hmda_ts_2007.txt <- hmda_ts_2007.zip [method:unzip]

; Convert each tab-delimited institution-record file to CSV with a header row.
hmda_ts_2012.csv <- hmda_ts_2012.txt [method:convert_ts_to_csv]
hmda_ts_2011.csv <- hmda_ts_2011.txt [method:convert_ts_to_csv]
hmda_ts_2010.csv <- hmda_ts_2010.txt [method:convert_ts_to_csv]
hmda_ts_2009.csv <- hmda_ts_2009.txt [method:convert_ts_to_csv]
hmda_ts_2008.csv <- hmda_ts_2008.txt [method:convert_ts_to_csv]
hmda_ts_2007.csv <- hmda_ts_2007.txt [method:convert_ts_to_csv]
