diff --git a/feathr-impl/src/main/scala/com/linkedin/feathr/offline/transformation/AnchorToDataSourceMapper.scala b/feathr-impl/src/main/scala/com/linkedin/feathr/offline/transformation/AnchorToDataSourceMapper.scala index ef714450b..db76d4ba0 100644 --- a/feathr-impl/src/main/scala/com/linkedin/feathr/offline/transformation/AnchorToDataSourceMapper.scala +++ b/feathr-impl/src/main/scala/com/linkedin/feathr/offline/transformation/AnchorToDataSourceMapper.scala @@ -4,7 +4,7 @@ import java.time.Duration import com.linkedin.feathr.common.{DateParam, DateTimeResolution} import com.linkedin.feathr.offline.source.SourceFormatType._ import com.linkedin.feathr.offline.anchored.feature.FeatureAnchorWithSource -import com.linkedin.feathr.offline.config.location.{PathList, SimplePath} +import com.linkedin.feathr.offline.config.location.{DataLocation, PathList, SimplePath} import com.linkedin.feathr.offline.generation.IncrementalAggContext import com.linkedin.feathr.offline.source.DataSource import com.linkedin.feathr.offline.source.accessor.DataSourceAccessor @@ -96,16 +96,19 @@ private[offline] class AnchorToDataSourceMapper(dataPathHandlers: List[DataPathH val dataLoaderHandlers: List[DataLoaderHandler] = dataPathHandlers.map(_.dataLoaderHandler) // Only file-based source has real "path", others are just single dataset - val adjustedObsTimeRange = if (factDataSource.location.isFileBasedLocation()) { + val (adjustedObsTimeRange, dataSourcePath) = if (factDataSource.location.isFileBasedLocation()) { val pathChecker = PathChecker(ss, dataLoaderHandlers) val pathAnalyzer = new TimeBasedHdfsPathAnalyzer(pathChecker, dataLoaderHandlers) val pathInfo = pathAnalyzer.analyze(factDataSource.path) if (pathInfo.dateTimeResolution == DateTimeResolution.DAILY) { - obsTimeRange.adjustWithDateTimeResolution(DateTimeResolution.DAILY) - } else obsTimeRange + (obsTimeRange.adjustWithDateTimeResolution(DateTimeResolution.DAILY), pathInfo.basePath) + } else (obsTimeRange, pathInfo.basePath) } 
else { - obsTimeRange + (obsTimeRange, factDataSource.path) } + // Use pathInfo's base path as the data source path, since it includes the daily/hourly keyword even when the configured path omits it (non-file-based sources keep their original path) + val updatedFactDataSource = DataSource(dataSourcePath, factDataSource.sourceType, factDataSource.timeWindowParams, + factDataSource.timePartitionPattern, factDataSource.postfixPath) val timeInterval = OfflineDateTimeUtils.getFactDataTimeRange(adjustedObsTimeRange, window, timeDelays) val needCreateTimestampColumn = SlidingWindowFeatureUtils.needCreateTimestampColumnFromPartition(factDataSource) @@ -115,7 +118,7 @@ private[offline] class AnchorToDataSourceMapper(dataPathHandlers: List[DataPathH val timeSeriesSource = DataSourceAccessor( ss = ss, - source = factDataSource, + source = updatedFactDataSource, dateIntervalOpt = Some(timeInterval), expectDatumType = None, failOnMissingPartition = failOnMissingPartition, diff --git a/feathr-impl/src/test/scala/com/linkedin/feathr/offline/SlidingWindowAggIntegTest.scala b/feathr-impl/src/test/scala/com/linkedin/feathr/offline/SlidingWindowAggIntegTest.scala index 9e45ec992..176e4a874 100644 --- a/feathr-impl/src/test/scala/com/linkedin/feathr/offline/SlidingWindowAggIntegTest.scala +++ b/feathr-impl/src/test/scala/com/linkedin/feathr/offline/SlidingWindowAggIntegTest.scala @@ -315,6 +315,77 @@ class SlidingWindowAggIntegTest extends FeathrIntegTest { assertEquals(df.getAs[Float]("simpleFeature"), 20f) } + /** + * SWA test where the source path does not have the daily keyword attached to it. The join should still work and produce the expected feature values. 
+ */ + @Test + def testSwaWithMalformedPath(): Unit = { + val joinConfigAsString = + """ + | settings: { + | observationDataTimeSettings: { + | absoluteTimeRange: { + | timeFormat: yyyy-MM-dd + | startTime: "2018-05-01" + | endTime: "2018-05-03" + | } + | } + | joinTimeSettings: { + | timestampColumn: { + | def: timestamp + | format: yyyy-MM-dd + | } + | } + |} + | + |features: [ + | { + | key: [x], + | featureList: ["simplePageViewCount", "simpleFeature"] + | } + |] + """.stripMargin + val featureDefAsString = + """ + |sources: { + | swaSource: { + | location: { path: "slidingWindowAgg/localSWADefaultTest/" } + | timePartitionPattern: "yyyy/MM/dd" + | timeWindowParameters: { + | timestampColumn: "timestamp" + | timestampColumnFormat: "yyyy-MM-dd" + | } + | } + |} + | + |anchors: { + | swaAnchor: { + | source: "swaSource" + | key: "x" + | features: { + | simplePageViewCount: { + | def: "aggregationWindow" + | aggregation: COUNT + | window: 3d + | default: 10 + | } + | simpleFeature: { + | def: "aggregationWindow" + | aggregation: COUNT + | window: 3d + | default: 20 + | } + | } + | } + |} + """.stripMargin + val res = runLocalFeatureJoinForTest(joinConfigAsString, featureDefAsString, observationDataPath = "slidingWindowAgg/localAnchorTestObsData.avro.json").data + res.show() + val df = res.collect()(0) + assertEquals(df.getAs[Float]("simplePageViewCount"), 10f) + assertEquals(df.getAs[Float]("simpleFeature"), 20f) + } + /** * SWA test with missing features. To enable this test, set the value of FeatureUtils.SKIP_MISSING_FEATURE to True. From * Spark 3.1, SparkContext.updateConf() is not supported.