@@ -1,17 +1,17 @@
 package spark
 
 import java.io.EOFException
-import java.net.URL
 import java.io.ObjectInputStream
-import java.util.concurrent.atomic.AtomicLong
+import java.net.URL
 import java.util.Random
 import java.util.Date
 import java.util.{HashMap => JHashMap}
+import java.util.concurrent.atomic.AtomicLong
 
-import scala.collection.mutable.ArrayBuffer
 import scala.collection.Map
-import scala.collection.mutable.HashMap
 import scala.collection.JavaConversions.mapAsScalaMap
+import scala.collection.mutable.ArrayBuffer
+import scala.collection.mutable.HashMap
 
 import org.apache.hadoop.io.BytesWritable
 import org.apache.hadoop.io.NullWritable
@@ -47,7 +47,7 @@ import spark.storage.StorageLevel
 import SparkContext._
 
 /**
- * A Resilient Distributed Dataset (RDD), the basic abstraction in Spark. Represents an immutable,
+ * A Resilient Distributed Dataset (RDD), the basic abstraction in Spark. Represents an immutable,
  * partitioned collection of elements that can be operated on in parallel. This class contains the
  * basic operations available on all RDDs, such as `map`, `filter`, and `persist`. In addition,
  * [[spark.PairRDDFunctions]] contains operations available only on RDDs of key-value pairs, such
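
For orientation only (not part of this patch), the operations named in the scaladoc above were typically combined in a driver program roughly as follows; `sc` is assumed to be an already-constructed `SparkContext`, and the data is made up:

```scala
// Hypothetical usage sketch of the basic RDD operations described above.
import spark.SparkContext._  // brings PairRDDFunctions into scope via implicits

val lines = sc.parallelize(Seq("spark", "rdd", "resilient distributed dataset"))
val wordCounts = lines
  .flatMap(_.split(" "))    // transformation: one element per word
  .filter(_.nonEmpty)       // transformation: drop empty strings
  .map(word => (word, 1))   // key-value pairs enable PairRDDFunctions
  .reduceByKey(_ + _)       // available through the implicit conversion above
wordCounts.persist()        // keep the result around for reuse
println(wordCounts.collect().mkString(", "))
```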
@@ -86,28 +86,28 @@ abstract class RDD[T: ClassManifest](@transient sc: SparkContext) extends Serial
   @transient val dependencies: List[Dependency[_]]
 
   // Methods available on all RDDs:
-
+
   /** Record user function generating this RDD. */
   private[spark] val origin = Utils.getSparkCallSite
-
+
   /** Optionally overridden by subclasses to specify how they are partitioned. */
   val partitioner: Option[Partitioner] = None
 
   /** Optionally overridden by subclasses to specify placement preferences. */
   def preferredLocations(split: Split): Seq[String] = Nil
-
+
   /** The [[spark.SparkContext]] that this RDD was created on. */
   def context = sc
 
   private[spark] def elementClassManifest: ClassManifest[T] = classManifest[T]
-
+
   /** A unique ID for this RDD (within its SparkContext). */
   val id = sc.newRddId()
-
+
   // Variables relating to persistence
   private var storageLevel: StorageLevel = StorageLevel.NONE
-
-  /**
+
+  /**
    * Set this RDD's storage level to persist its values across operations after the first time
    * it is computed. Can only be called once on each RDD.
    */
@@ -123,32 +123,32 @@ abstract class RDD[T: ClassManifest](@transient sc: SparkContext) extends Serial
 
   /** Persist this RDD with the default storage level (`MEMORY_ONLY`). */
   def persist(): RDD[T] = persist(StorageLevel.MEMORY_ONLY)
-
+
   /** Persist this RDD with the default storage level (`MEMORY_ONLY`). */
   def cache(): RDD[T] = persist()
 
   /** Get the RDD's current storage level, or StorageLevel.NONE if none is set. */
   def getStorageLevel = storageLevel
-
+
   private[spark] def checkpoint(level: StorageLevel = StorageLevel.MEMORY_AND_DISK_2): RDD[T] = {
     if (!level.useDisk && level.replication < 2) {
       throw new Exception("Cannot checkpoint without using disk or replication (level requested was " + level + ")")
-    }
-
+    }
+
     // This is a hack. Ideally this should re-use the code used by the CacheTracker
     // to generate the key.
     def getSplitKey(split: Split) = "rdd_%d_%d".format(this.id, split.index)
-
+
     persist(level)
     sc.runJob(this, (iter: Iterator[T]) => {})
-
+
     val p = this.partitioner
-
+
     new BlockRDD[T](sc, splits.map(getSplitKey).toArray) {
-      override val partitioner = p
+      override val partitioner = p
     }
   }
-
+
   /**
    * Internal method to this RDD; will read from cache if applicable, or otherwise compute it.
    * This should ''not'' be called by users directly, but is available for implementors of custom
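
As context for the hunk above (the change itself is whitespace-only), `persist`, `cache`, and `getStorageLevel` were typically used along these lines; the RDD below is an illustrative stand-in, and `sc` is again assumed to exist:

```scala
// Hypothetical sketch of persist/cache as documented above.
import spark.storage.StorageLevel

val data = sc.parallelize(1 to 1000000)
data.persist(StorageLevel.MEMORY_ONLY)  // explicit storage level (also the default)
// data.cache() is equivalent shorthand, but persist can only be called once per RDD
println(data.getStorageLevel)           // StorageLevel set above
println(data.reduce(_ + _))             // first action computes and caches the data
```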
@@ -161,9 +161,9 @@ abstract class RDD[T: ClassManifest](@transient sc: SparkContext) extends Serial
       compute(split)
     }
   }
-
+
   // Transformations (return a new RDD)
-
+
   /**
    * Return a new RDD by applying a function to all elements of this RDD.
    */
@@ -199,13 +199,13 @@ abstract class RDD[T: ClassManifest](@transient sc: SparkContext) extends Serial
     var multiplier = 3.0
     var initialCount = count()
     var maxSelected = 0
-
+
     if (initialCount > Integer.MAX_VALUE - 1) {
       maxSelected = Integer.MAX_VALUE - 1
    } else {
       maxSelected = initialCount.toInt
     }
-
+
     if (num > initialCount) {
       total = maxSelected
       fraction = math.min(multiplier * (maxSelected + 1) / initialCount, 1.0)
@@ -215,14 +215,14 @@ abstract class RDD[T: ClassManifest](@transient sc: SparkContext) extends Serial
       fraction = math.min(multiplier * (num + 1) / initialCount, 1.0)
       total = num
     }
-
+
     val rand = new Random(seed)
     var samples = this.sample(withReplacement, fraction, rand.nextInt).collect()
-
+
     while (samples.length < total) {
       samples = this.sample(withReplacement, fraction, rand.nextInt).collect()
     }
-
+
     Utils.randomizeInPlace(samples, rand).take(total)
   }
 
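
The two hunks above fall inside `takeSample`: it oversamples with a multiplier of 3, re-samples until enough elements have been collected, then shuffles the result and truncates it to `num`. A hedged usage sketch, with made-up data and assuming the `(withReplacement, num, seed)` parameter order shown in the code:

```scala
// Illustrative call of takeSample as implemented above.
val nums = sc.parallelize(1 to 10000)
// Request 5 elements without replacement, with a fixed seed for repeatability.
val sampled = nums.takeSample(false, 5, 42)
println(sampled.mkString(", "))
```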
@@ -290,8 +290,10 @@ abstract class RDD[T: ClassManifest](@transient sc: SparkContext) extends Serial
    * Return a new RDD by applying a function to each partition of this RDD, while tracking the index
    * of the original partition.
    */
-  def mapPartitionsWithSplit[U: ClassManifest](f: (Int, Iterator[T]) => Iterator[U]): RDD[U] =
-    new MapPartitionsWithSplitRDD(this, sc.clean(f))
+  def mapPartitionsWithSplit[U: ClassManifest](
+      f: (Int, Iterator[T]) => Iterator[U],
+      preservesPartitioning: Boolean = false): RDD[U] =
+    new MapPartitionsWithSplitRDD(this, sc.clean(f), preservesPartitioning)
 
   // Actions (launch a job to return a value to the user program)
 
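
This is the substantive change in the diff: `mapPartitionsWithSplit` gains an optional `preservesPartitioning` flag so a caller that leaves keys untouched can keep the parent RDD's partitioner. A hedged sketch of how the new parameter might be used; the pair RDD and the tagging function are illustrative, not part of this patch:

```scala
// Illustrative call of the new signature; data and logic are made up.
import spark.{HashPartitioner, SparkContext}
import spark.SparkContext._  // implicit conversion to PairRDDFunctions

val pairs = sc.parallelize(Seq(("a", 1), ("b", 2), ("c", 3)))
  .partitionBy(new HashPartitioner(4))

// Tag each value with the index of the partition it lives in. The keys are
// unchanged, so it is safe to declare the partitioning preserved and keep
// pairs.partitioner on the result.
val tagged = pairs.mapPartitionsWithSplit(
  (index: Int, iter: Iterator[(String, Int)]) => iter.map { case (k, v) => (k, (index, v)) },
  preservesPartitioning = true)
```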
@@ -342,7 +344,7 @@ abstract class RDD[T: ClassManifest](@transient sc: SparkContext) extends Serial
 
   /**
    * Aggregate the elements of each partition, and then the results for all the partitions, using a
-   * given associative function and a neutral "zero value". The function op(t1, t2) is allowed to
+   * given associative function and a neutral "zero value". The function op(t1, t2) is allowed to
    * modify t1 and return it as its result value to avoid object allocation; however, it should not
    * modify t2.
    */
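
The doc comment above describes `fold`. A small, hedged illustration of the associative function and its neutral zero value (the numbers are arbitrary):

```scala
// Illustrative fold as described in the doc comment above.
val values = sc.parallelize(Seq(1, 2, 3, 4), 2)
// 0 is the neutral zero value for +; because + is associative, per-partition
// partial results can be combined in any order.
val sum = values.fold(0)(_ + _)
println(sum)  // 10
```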
@@ -443,7 +445,7 @@ abstract class RDD[T: ClassManifest](@transient sc: SparkContext) extends Serial
     val evaluator = new GroupedCountEvaluator[T](splits.size, confidence)
     sc.runApproximateJob(this, countPartition, evaluator, timeout)
   }
-
+
   /**
    * Take the first num elements of the RDD. This currently scans the partitions *one by one*, so
    * it will be slow if a lot of partitions are required. In that case, use collect() to get the
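
The truncated doc comment above belongs to `take(num)`. A hedged sketch of the trade-off it mentions, with made-up data:

```scala
// Illustrative use of take vs. collect, as the doc comment above advises.
val big = sc.parallelize(1 to 100000, 100)
val firstTen = big.take(10)      // scans partitions one by one; fine for a small num
val everything = big.collect()   // single parallel job; use when most elements are needed
println(firstTen.mkString(", "))
```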