From 76c11e5efa7fccf3dfb42b1d7d0f6fa5245f5a3f Mon Sep 17 00:00:00 2001 From: Manpreet Singh Date: Fri, 22 May 2020 03:10:15 -0700 Subject: [PATCH 01/11] Save counters to local driver path if temporary directory is not given --- mrjob/bin.py | 3 +-- mrjob/spark/harness.py | 24 +++++++++++++++++------- 2 files changed, 18 insertions(+), 9 deletions(-) diff --git a/mrjob/bin.py b/mrjob/bin.py index 4584ff721..2a1ede587 100644 --- a/mrjob/bin.py +++ b/mrjob/bin.py @@ -845,6 +845,7 @@ def _run_spark_submit(self, spark_submit_args, env, record_callback): else: # we have PTYs if pid == 0: # we are the child process + log.debug('Invoking spark-submit via PTY') try: os.execvpe(spark_submit_args[0], spark_submit_args, env) # now this process is no longer Python @@ -856,8 +857,6 @@ def _run_spark_submit(self, spark_submit_args, env, record_callback): # if we get some other exception, still exit hard os._exit(-1) else: - log.debug('Invoking spark-submit via PTY') - with os.fdopen(master_fd, 'rb') as master: step_interpretation = ( _parse_spark_log( diff --git a/mrjob/spark/harness.py b/mrjob/spark/harness.py index 934776eae..ea4b5e797 100644 --- a/mrjob/spark/harness.py +++ b/mrjob/spark/harness.py @@ -281,13 +281,23 @@ def make_mrc_job(mrc, step_num): if args.counter_output_dir is not None: counters = [ca.value for ca in counter_accumulators] - sc.parallelize( - [json.dumps(counters)], - numSlices=1 - ).saveAsTextFile( - args.counter_output_dir - ) - + # If the given path is an s3 path, use s3.parallelize, + # otherwise just write them directly to the local dir + if args.counter_output_dir.startswith("s3://") or args.counter_output_dir.startswith("s3a://"): + sc.parallelize( + [json.dumps(counters)], + numSlices=1 + ).saveAsTextFile( + args.counter_output_dir + ) + else: + path = args.counter_output_dir + "/part-0000" + if not os.path.exists(args.counter_output_dir): + os.mkdir(args.counter_output_dir) + with open(path, 'w') as wb: + wb.write(str(json.dumps(counters))) + with open(path, 'rb') as rb: + print(rb.read()) def _text_file_with_path(sc, path): """Return an RDD that yields (path, line) for each line in the file. From f641344f98a26f23a77c878e1a26955d9b22ff91 Mon Sep 17 00:00:00 2001 From: Manpreet Singh Date: Fri, 22 May 2020 10:13:18 -0700 Subject: [PATCH 02/11] fix tests --- mrjob/spark/harness.py | 17 +++++++---------- tests/spark/test_harness.py | 5 +++-- 2 files changed, 10 insertions(+), 12 deletions(-) diff --git a/mrjob/spark/harness.py b/mrjob/spark/harness.py index ea4b5e797..0b8485ae4 100644 --- a/mrjob/spark/harness.py +++ b/mrjob/spark/harness.py @@ -283,21 +283,18 @@ def make_mrc_job(mrc, step_num): # If the given path is an s3 path, use s3.parallelize, # otherwise just write them directly to the local dir - if args.counter_output_dir.startswith("s3://") or args.counter_output_dir.startswith("s3a://"): - sc.parallelize( - [json.dumps(counters)], - numSlices=1 - ).saveAsTextFile( - args.counter_output_dir - ) - else: + sc.parallelize( + [json.dumps(counters)], + numSlices=1 + ).saveAsTextFile( + args.counter_output_dir + ) + if not (args.counter_output_dir.startswith("s3://") or args.counter_output_dir.startswith("s3a://")): path = args.counter_output_dir + "/part-0000" if not os.path.exists(args.counter_output_dir): os.mkdir(args.counter_output_dir) with open(path, 'w') as wb: wb.write(str(json.dumps(counters))) - with open(path, 'rb') as rb: - print(rb.read()) def _text_file_with_path(sc, path): """Return an RDD that yields (path, line) for each line in the file. diff --git a/tests/spark/test_harness.py b/tests/spark/test_harness.py index 70e9dcd56..fd7db1f20 100644 --- a/tests/spark/test_harness.py +++ b/tests/spark/test_harness.py @@ -455,14 +455,15 @@ def test_increment_counter(self): with self.create_temp_counter_dir() as output_counter_dir: harness_job = self._harness_job( MRCountingJob, input_bytes=input_bytes, - counter_output_dir='file://{}'.format(output_counter_dir) + counter_output_dir='{}'.format(output_counter_dir) ) + print('counter output dir', output_counter_dir) with harness_job.make_runner() as runner: runner.run() harness_counters = json.loads( self.spark_context.textFile( - 'file://' + output_counter_dir + output_counter_dir ).collect()[0]) self.assertEqual(harness_counters, reference_counters) From 2ae4efc050d43b2dc0d596224cfc75ef48bd04d9 Mon Sep 17 00:00:00 2001 From: Manpreet Singh Date: Fri, 22 May 2020 10:21:49 -0700 Subject: [PATCH 03/11] use is_uri check --- mrjob/spark/harness.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mrjob/spark/harness.py b/mrjob/spark/harness.py index 0b8485ae4..97d710d59 100644 --- a/mrjob/spark/harness.py +++ b/mrjob/spark/harness.py @@ -20,6 +20,7 @@ from importlib import import_module from itertools import chain +from mrjob.parse import is_uri from mrjob.util import shlex_split from pyspark.accumulators import AccumulatorParam @@ -289,7 +290,7 @@ def make_mrc_job(mrc, step_num): ).saveAsTextFile( args.counter_output_dir ) - if not (args.counter_output_dir.startswith("s3://") or args.counter_output_dir.startswith("s3a://")): + if not is_uri(args.counter_output_dir): path = args.counter_output_dir + "/part-0000" if not os.path.exists(args.counter_output_dir): os.mkdir(args.counter_output_dir) From ea2a40b39cb48628f3b811cecbc141dcfb9c936d Mon Sep 17 00:00:00 2001 From: Manpreet Singh Date: Fri, 22 May 2020 14:01:46 -0700 Subject: [PATCH 04/11] fix tests --- tests/spark/test_harness.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/spark/test_harness.py b/tests/spark/test_harness.py index fd7db1f20..ad5ba232a 100644 --- a/tests/spark/test_harness.py +++ b/tests/spark/test_harness.py @@ -455,7 +455,7 @@ def test_increment_counter(self): with self.create_temp_counter_dir() as output_counter_dir: harness_job = self._harness_job( MRCountingJob, input_bytes=input_bytes, - counter_output_dir='{}'.format(output_counter_dir) + counter_output_dir='file://{}'.format(output_counter_dir) ) print('counter output dir', output_counter_dir) with harness_job.make_runner() as runner: @@ -463,7 +463,7 @@ def test_increment_counter(self): harness_counters = json.loads( self.spark_context.textFile( - output_counter_dir + 'file://' + output_counter_dir ).collect()[0]) self.assertEqual(harness_counters, reference_counters) From 30b5066a9425adf8a7e414e5d06d1e861fb16e21 Mon Sep 17 00:00:00 2001 From: Manpreet Singh Date: Fri, 22 May 2020 14:44:21 -0700 Subject: [PATCH 05/11] fix test --- mrjob/spark/harness.py | 1 + mrjob/spark/runner.py | 1 + 2 files changed, 2 insertions(+) diff --git a/mrjob/spark/harness.py b/mrjob/spark/harness.py index 97d710d59..093e57785 100644 --- a/mrjob/spark/harness.py +++ b/mrjob/spark/harness.py @@ -290,6 +290,7 @@ def make_mrc_job(mrc, step_num): ).saveAsTextFile( args.counter_output_dir ) + print('counter output dir', args.counter_output_dir) if not is_uri(args.counter_output_dir): path = args.counter_output_dir + "/part-0000" if not os.path.exists(args.counter_output_dir): diff --git a/mrjob/spark/runner.py b/mrjob/spark/runner.py index f25f09966..02fa4cd51 100644 --- a/mrjob/spark/runner.py +++ b/mrjob/spark/runner.py @@ -332,6 +332,7 @@ def _run_step_on_spark(self, step, step_num, last_step_num=None): counter_file = self.fs.join( self._counter_output_dir(step_num), 'part-*') counter_json = b''.join(self.fs.cat(counter_file)) + log.info('counters %s' % counter_json) if counter_json.strip(): # json.loads() on Python 3.4/3.5 can't take bytes counters = json.loads(to_unicode(counter_json)) From eb3cfdc52f4c1af81e345e9111b129cc20954c38 Mon Sep 17 00:00:00 2001 From: Manpreet Singh Date: Fri, 22 May 2020 14:45:13 -0700 Subject: [PATCH 06/11] test --- mrjob/spark/harness.py | 1 - tests/spark/test_harness.py | 1 - 2 files changed, 2 deletions(-) diff --git a/mrjob/spark/harness.py b/mrjob/spark/harness.py index 093e57785..97d710d59 100644 --- a/mrjob/spark/harness.py +++ b/mrjob/spark/harness.py @@ -290,7 +290,6 @@ def make_mrc_job(mrc, step_num): ).saveAsTextFile( args.counter_output_dir ) - print('counter output dir', args.counter_output_dir) if not is_uri(args.counter_output_dir): path = args.counter_output_dir + "/part-0000" if not os.path.exists(args.counter_output_dir): diff --git a/tests/spark/test_harness.py b/tests/spark/test_harness.py index ad5ba232a..70e9dcd56 100644 --- a/tests/spark/test_harness.py +++ b/tests/spark/test_harness.py @@ -457,7 +457,6 @@ def test_increment_counter(self): MRCountingJob, input_bytes=input_bytes, counter_output_dir='file://{}'.format(output_counter_dir) ) - print('counter output dir', output_counter_dir) with harness_job.make_runner() as runner: runner.run() From 7dd015da43ccf088e80b344dfa5ef023fbcec304 Mon Sep 17 00:00:00 2001 From: Manpreet Singh Date: Fri, 22 May 2020 17:20:34 -0700 Subject: [PATCH 07/11] fix tests --- mrjob/spark/harness.py | 6 +++++- mrjob/spark/runner.py | 12 +++++++++++- 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/mrjob/spark/harness.py b/mrjob/spark/harness.py index 97d710d59..62e42ab0b 100644 --- a/mrjob/spark/harness.py +++ b/mrjob/spark/harness.py @@ -17,6 +17,7 @@ import json from argparse import ArgumentParser from collections import defaultdict +from glob import glob from importlib import import_module from itertools import chain @@ -290,10 +291,13 @@ def make_mrc_job(mrc, step_num): ).saveAsTextFile( args.counter_output_dir ) - if not is_uri(args.counter_output_dir): + print('find files', glob(args.counter_output_dir+'/*')) + print('find part-files files', glob(args.counter_output_dir+'/part-*')) + if not is_uri(args.counter_output_dir) and not glob(args.counter_output_dir+'/part-*'): path = args.counter_output_dir + "/part-0000" if not os.path.exists(args.counter_output_dir): os.mkdir(args.counter_output_dir) + print('writing counters', counters) with open(path, 'w') as wb: wb.write(str(json.dumps(counters))) diff --git a/mrjob/spark/runner.py b/mrjob/spark/runner.py index 02fa4cd51..528ef9a76 100644 --- a/mrjob/spark/runner.py +++ b/mrjob/spark/runner.py @@ -333,9 +333,19 @@ def _run_step_on_spark(self, step, step_num, last_step_num=None): self._counter_output_dir(step_num), 'part-*') counter_json = b''.join(self.fs.cat(counter_file)) log.info('counters %s' % counter_json) + prince('json counters', counter_json) if counter_json.strip(): # json.loads() on Python 3.4/3.5 can't take bytes - counters = json.loads(to_unicode(counter_json)) + try: + counters = json.loads(to_unicode(counter_json)) + except Exception as e: + print(e) + try: + counters = json.loads(counter_json) + except Exception as e: + print('using direct json', e) + counters = counter_json + if isinstance(counters, list): self._counters.extend(counters) From ac4d16d317ac65bbe67330bcce3fd8cfc28413ac Mon Sep 17 00:00:00 2001 From: Manpreet Singh Date: Fri, 22 May 2020 17:50:16 -0700 Subject: [PATCH 08/11] fix test --- mrjob/spark/harness.py | 1 + mrjob/spark/runner.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/mrjob/spark/harness.py b/mrjob/spark/harness.py index 62e42ab0b..f1d9693fc 100644 --- a/mrjob/spark/harness.py +++ b/mrjob/spark/harness.py @@ -293,6 +293,7 @@ def make_mrc_job(mrc, step_num): ) print('find files', glob(args.counter_output_dir+'/*')) print('find part-files files', glob(args.counter_output_dir+'/part-*')) + # Use regular python buildin file writer if the part-* file is not created if not is_uri(args.counter_output_dir) and not glob(args.counter_output_dir+'/part-*'): path = args.counter_output_dir + "/part-0000" if not os.path.exists(args.counter_output_dir): diff --git a/mrjob/spark/runner.py b/mrjob/spark/runner.py index 528ef9a76..43063f9d6 100644 --- a/mrjob/spark/runner.py +++ b/mrjob/spark/runner.py @@ -333,7 +333,7 @@ def _run_step_on_spark(self, step, step_num, last_step_num=None): self._counter_output_dir(step_num), 'part-*') counter_json = b''.join(self.fs.cat(counter_file)) log.info('counters %s' % counter_json) - prince('json counters', counter_json) + print('json counters', counter_json) if counter_json.strip(): # json.loads() on Python 3.4/3.5 can't take bytes try: From 7f86f6811e0ac826b8eac99302412671a22b4d61 Mon Sep 17 00:00:00 2001 From: Manpreet Singh Date: Fri, 22 May 2020 20:47:39 -0700 Subject: [PATCH 09/11] fix tests --- mrjob/spark/harness.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/mrjob/spark/harness.py b/mrjob/spark/harness.py index f1d9693fc..52d316032 100644 --- a/mrjob/spark/harness.py +++ b/mrjob/spark/harness.py @@ -291,10 +291,8 @@ def make_mrc_job(mrc, step_num): ).saveAsTextFile( args.counter_output_dir ) - print('find files', glob(args.counter_output_dir+'/*')) - print('find part-files files', glob(args.counter_output_dir+'/part-*')) # Use regular python buildin file writer if the part-* file is not created - if not is_uri(args.counter_output_dir) and not glob(args.counter_output_dir+'/part-*'): + if not is_uri(args.counter_output_dir) and not glob(args.counter_output_dir + "/path-*"): path = args.counter_output_dir + "/part-0000" if not os.path.exists(args.counter_output_dir): os.mkdir(args.counter_output_dir) From bf7607c95f04068e476d313d0be12e5874116849 Mon Sep 17 00:00:00 2001 From: Manpreet Singh Date: Sat, 23 May 2020 20:02:15 -0700 Subject: [PATCH 10/11] fix tests --- mrjob/bin.py | 2 +- mrjob/spark/harness.py | 4 +--- mrjob/spark/runner.py | 4 ---- 3 files changed, 2 insertions(+), 8 deletions(-) diff --git a/mrjob/bin.py b/mrjob/bin.py index 2a1ede587..59e587edf 100644 --- a/mrjob/bin.py +++ b/mrjob/bin.py @@ -845,7 +845,6 @@ def _run_spark_submit(self, spark_submit_args, env, record_callback): else: # we have PTYs if pid == 0: # we are the child process - log.debug('Invoking spark-submit via PTY') try: os.execvpe(spark_submit_args[0], spark_submit_args, env) # now this process is no longer Python @@ -857,6 +856,7 @@ def _run_spark_submit(self, spark_submit_args, env, record_callback): # if we get some other exception, still exit hard os._exit(-1) else: + log.debug('Invoking spark-submit via PTY') with os.fdopen(master_fd, 'rb') as master: step_interpretation = ( _parse_spark_log( diff --git a/mrjob/spark/harness.py b/mrjob/spark/harness.py index 52d316032..68a40be02 100644 --- a/mrjob/spark/harness.py +++ b/mrjob/spark/harness.py @@ -25,7 +25,6 @@ from mrjob.util import shlex_split from pyspark.accumulators import AccumulatorParam - # tuples of (args, kwargs) for ArgumentParser.add_argument() # # TODO: this is shared code with mr_spark_harness.py, which started out @@ -292,11 +291,10 @@ def make_mrc_job(mrc, step_num): args.counter_output_dir ) # Use regular python buildin file writer if the part-* file is not created - if not is_uri(args.counter_output_dir) and not glob(args.counter_output_dir + "/path-*"): + if not is_uri(args.counter_output_dir) and not glob(args.counter_output_dir + "/part-*"): path = args.counter_output_dir + "/part-0000" if not os.path.exists(args.counter_output_dir): os.mkdir(args.counter_output_dir) - print('writing counters', counters) with open(path, 'w') as wb: wb.write(str(json.dumps(counters))) diff --git a/mrjob/spark/runner.py b/mrjob/spark/runner.py index 43063f9d6..7d12f9e44 100644 --- a/mrjob/spark/runner.py +++ b/mrjob/spark/runner.py @@ -332,18 +332,14 @@ def _run_step_on_spark(self, step, step_num, last_step_num=None): counter_file = self.fs.join( self._counter_output_dir(step_num), 'part-*') counter_json = b''.join(self.fs.cat(counter_file)) - log.info('counters %s' % counter_json) - print('json counters', counter_json) if counter_json.strip(): # json.loads() on Python 3.4/3.5 can't take bytes try: counters = json.loads(to_unicode(counter_json)) except Exception as e: - print(e) try: counters = json.loads(counter_json) except Exception as e: - print('using direct json', e) counters = counter_json From db200a15a3e2d467067b4099700ffef03d1791fb Mon Sep 17 00:00:00 2001 From: Manpreet Singh Date: Sat, 23 May 2020 20:49:17 -0700 Subject: [PATCH 11/11] fix tests --- mrjob/spark/runner.py | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/mrjob/spark/runner.py b/mrjob/spark/runner.py index 7d12f9e44..f25f09966 100644 --- a/mrjob/spark/runner.py +++ b/mrjob/spark/runner.py @@ -334,14 +334,7 @@ def _run_step_on_spark(self, step, step_num, last_step_num=None): counter_json = b''.join(self.fs.cat(counter_file)) if counter_json.strip(): # json.loads() on Python 3.4/3.5 can't take bytes - try: - counters = json.loads(to_unicode(counter_json)) - except Exception as e: - try: - counters = json.loads(counter_json) - except Exception as e: - counters = counter_json - + counters = json.loads(to_unicode(counter_json)) if isinstance(counters, list): self._counters.extend(counters)