thecoderpanda
diff --git a/‎Makefile
Lines changed: 14 additions & 0 deletions b/‎Makefile
Lines changed: 14 additions & 0 deletions
diff --git a/‎README
Lines changed: 5 additions & 2 deletions b/‎README
Lines changed: 5 additions & 2 deletions
diff --git a/‎clojure_examples/README
Lines changed: 10 additions & 6 deletions b/‎clojure_examples/README
Lines changed: 10 additions & 6 deletions
diff --git a/‎google_book_ngram_data/README.txt
Lines changed: 27 additions & 0 deletions b/‎google_book_ngram_data/README.txt
Lines changed: 27 additions & 0 deletions
diff --git a/‎google_book_ngram_data/best_ngrams.rb
Lines changed: 55 additions & 0 deletions b/‎google_book_ngram_data/best_ngrams.rb
Lines changed: 55 additions & 0 deletions
@@ -1,3 +1,17 @@
 # Removes the out directory, then cleans the Clojure examples (lein clean plus
 # the contents of lib and the classes directory).
 # NOTE(review): this target does not remove mr_temp or namefinder.jar, which
 # the mapreduce_example target creates - confirm whether that is intended.
 clean:
 	rm -r -f out
 	(cd clojure_examples; lein clean; rm -r -f lib/* classes)
+
+# Builds namefinder.jar for the map-reduce example:
+#   1. recreates the scratch directory mr_temp from nothing,
+#   2. copies NameFinder.java, the three NLP sources it is staged with, and
+#      test_data/propername.ser into matching paths under mr_temp,
+#   3. unpacks lib/hadoop-core-1.1.2.jar into mr_temp (presumably so the Hadoop
+#      classes are found by javac and end up inside the final jar - confirm),
+#   4. compiles NameFinder.java from inside mr_temp,
+#   5. packs everything in mr_temp into ../namefinder.jar.
+mapreduce_example:
+	rm -r -f mr_temp
+	mkdir -p mr_temp/nlp/com/knowledgebooks/mapreduce
+	mkdir -p mr_temp/nlp/com/knowledgebooks/nlp/util
+	cp src/nlp/com/knowledgebooks/mapreduce/NameFinder.java mr_temp/nlp/com/knowledgebooks/mapreduce/
+	cp src/nlp/com/knowledgebooks/nlp/util/ScoredList.java mr_temp/nlp/com/knowledgebooks/nlp/util/
+	cp src/nlp/com/knowledgebooks/nlp/util/Tokenizer.java mr_temp/nlp/com/knowledgebooks/nlp/util/
+	cp src/nlp/com/knowledgebooks/nlp/ExtractNames.java mr_temp/nlp/com/knowledgebooks/nlp/
+	mkdir -p mr_temp/test_data
+	cp test_data/propername.ser mr_temp/test_data/
+	(cd mr_temp; jar xvf ../lib/hadoop-core-1.1.2.jar)
+	(cd mr_temp; javac nlp/com/knowledgebooks/mapreduce/NameFinder.java)
+	(cd mr_temp; jar cvf ../namefinder.jar .)
@@ -1,3 +1,6 @@
-I am currently writing the 4th edition of my "Practical Artificial Intelligence Programming with Java" book.
+# Code examples for the 4th edition of "Practical Artificial Intelligence Programming with Java"
 
-This git repo currently has the code examples for the 3rd edition and over the next few months they will be updated for the 4th edition.
+All code examples can be used either under the LGPL version 3 license or the Apache 2 license.
+
+You can buy a copy of the book (includes PDF, Kindle, and iPad/iPhone formats)
+at
@@ -1,13 +1,17 @@
-# clojure_examples
+# clojure_examples for the book "Practical Artificial Intelligence Programming with Java"
 
-FIXME: write description
+This directory contains Clojure wrappers for some of the Java example programs in the book.
 
-## Usage
+## Getting started with the Clojure examples
 
-FIXME: write
+The easiest way to make sure everything is set up to run correctly is to try:
+
+lein test
+
+in order to run the unit tests. The source code for the unit tests shows how to call the Clojure wrappers.
 
 ## License
 
-Copyright (C) 2012 FIXME
+Copyright (C) 2012 Mark Watson
 
-Distributed under the Eclipse Public License, the same as Clojure.
+Distributed under both the LGPL 3.0 and the Apache 2 licenses - pick the license that works best for you.
@@ -0,0 +1,27 @@
+# This directory accompanies the Chapter on Data Science
+
+You will need to run the script best_ngrams.rb 5 times, setting the variable $$match$$ to:
+
+    1gram
+    2gram
+    3gram
+    4gram
+    5gram
+
+You will also need to adjust the value of $$CUTOFF$$.
+
+Also, on the leased Linux server I used, I wrote the best ngram data ("best" in the sense that I only kept ngrams with a use count greater than $$CUTOFF$$) to my home directory, "/home/markw"; you will want to change the target directory for your system.
+
+~~~~~~~~
+match = "3gram"
+CUTOFF = 500
+
+$words = "====="
+$count = 0
+
+$out = File.new("/home/markw/#{match}.txt", 'w')
+
+File.new("ngrams_uris.txt").lines.each do |line|
+  if line.index("<a href='") && line.index(match)
+~~~~~~~~
+
@@ -0,0 +1,55 @@
+# new: gunzip files one at a time:
+
+match = "3gram"
+CUTOFF = 500
+
+$words = "====="
+$count = 0
+
+$out = File.new("/home/markw/#{match}.txt", 'w')
+
+File.new("ngrams_uris.txt").lines.each do |line|
+  if line.index("<a href='") && line.index(match)
+    uri = line[9...line.index("'>")]
+    puts "|#{uri}|"
+    `wget #{uri}`
+    sleep 60
+
+    Dir.entries(".").each do |fn|
+      if fn.index(match) && fn.index(".gz")
+        file_root = fn[0..-4]
+        puts `gunzip #{fn}`
+        sleep 20
+        puts `ls -lh #{file_root}*`
+        count = 0
+        File.new(file_root).each_line.each do |line|
+          count += 1
+          tokens = line.split("\t")
+          if tokens[1].size > 0
+            #puts tokens.join("|")
+            words = tokens[0].downcase.split.collect do |w|
+              index = w.index("_")
+              if w.length<2 || w[0]=="_" || w[0]=="(" || w[0]==")" || w[0]=="." || w[0]=="'"
+                "^"
+              elsif index
+                w[0...index]
+              else
+                w
+              end 
+            end.join(' ')
+            if $words == words
+              $count += tokens[2].to_i if !words.index("^")
+            else
+              $out.puts "#{$words}\t#{$count}" if $words != "=====" && !$words.index("^") && $count > 20 && !$words.index(",") && !$words.index(".")  && !$words.index(";") && !$words.index(":") && !$words.index("!")  && $words[0]!="0" && $words[0].to_i==0 if $count > CUTOFF
+              $words = words
+              $count = tokens[2].to_i
+            end
+          end
+        end
+        puts "count=#{count} for #{file_root}"
+        puts `rm -r -f  *#{file_root}*`
+      end
+    end
+  end
+end
+$out.close