diff --git a/mrjob/bin.py b/mrjob/bin.py index 4584ff721..59e587edf 100644 --- a/mrjob/bin.py +++ b/mrjob/bin.py @@ -857,7 +857,6 @@ def _run_spark_submit(self, spark_submit_args, env, record_callback): os._exit(-1) else: log.debug('Invoking spark-submit via PTY') - with os.fdopen(master_fd, 'rb') as master: step_interpretation = ( _parse_spark_log( diff --git a/mrjob/spark/harness.py b/mrjob/spark/harness.py index 934776eae..68a40be02 100644 --- a/mrjob/spark/harness.py +++ b/mrjob/spark/harness.py @@ -17,13 +17,14 @@ import json from argparse import ArgumentParser from collections import defaultdict +from glob import glob from importlib import import_module from itertools import chain +from mrjob.parse import is_uri from mrjob.util import shlex_split from pyspark.accumulators import AccumulatorParam - # tuples of (args, kwargs) for ArgumentParser.add_argument() # # TODO: this is shared code with mr_spark_harness.py, which started out @@ -281,13 +282,21 @@ def make_mrc_job(mrc, step_num): if args.counter_output_dir is not None: counters = [ca.value for ca in counter_accumulators] + # If the given path is an s3 path, use sc.parallelize, + # otherwise just write them directly to the local dir sc.parallelize( [json.dumps(counters)], numSlices=1 ).saveAsTextFile( args.counter_output_dir ) - + # Use Python's regular built-in file writer if no part-* file was created + if not is_uri(args.counter_output_dir) and not glob(args.counter_output_dir + "/part-*"): + path = args.counter_output_dir + "/part-0000" + if not os.path.exists(args.counter_output_dir): + os.mkdir(args.counter_output_dir) + with open(path, 'w') as wb: + wb.write(str(json.dumps(counters))) def _text_file_with_path(sc, path): """Return an RDD that yields (path, line) for each line in the file.