Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion python/pyspark/sql/tests/test_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,9 @@ def test_function_parity(self):
missing_in_py = jvm_fn_set.difference(py_fn_set)

# Functions that we expect to be missing in python until they are added to pyspark
expected_missing_in_py = set()
expected_missing_in_py = {
"unix_nanos", # SPARK-57527: PySpark support tracked as a follow-up
}

self.assertEqual(
expected_missing_in_py, missing_in_py, "Missing functions in pyspark not as expected"
Expand Down
10 changes: 10 additions & 0 deletions sql/api/src/main/scala/org/apache/spark/sql/functions.scala
Original file line number Diff line number Diff line change
Expand Up @@ -8199,6 +8199,16 @@ object functions {
*/
def unix_micros(e: Column): Column = Column.fn("unix_micros", e)

/**
* Returns the number of nanoseconds since 1970-01-01 00:00:00 UTC for a nanosecond-precision
* timestamp (`TIMESTAMP_LTZ(p)` / `TIMESTAMP_NTZ(p)`, `p` in `[7, 9]`). The result is a
* lossless `DECIMAL(21, 0)`.
*
* @group datetime_funcs
* @since 4.3.0
*/
def unix_nanos(e: Column): Column = Column.fn("unix_nanos", e)

/**
* Returns the number of milliseconds since 1970-01-01 00:00:00 UTC. Truncates higher levels of
* precision.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -777,6 +777,7 @@ object FunctionRegistry {
expression[UnixSeconds]("unix_seconds"),
expression[UnixMillis]("unix_millis"),
expression[UnixMicros]("unix_micros"),
expression[UnixNanos]("unix_nanos"),
expression[ConvertTimezone]("convert_timezone"),
expressionBuilder("time_bucket", TimeBucketExpressionBuilder),

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@

package org.apache.spark.sql.catalyst.expressions

import java.math.BigInteger
import java.text.ParseException
import java.time.{DateTimeException, LocalDate, LocalDateTime, ZoneId, ZoneOffset}
import java.time.format.DateTimeParseException
Expand Down Expand Up @@ -854,6 +855,57 @@ case class UnixMicros(child: Expression) extends TimestampToLongBase {
copy(child = newChild)
}

// scalastyle:off line.contains.tab
@ExpressionDescription(
usage = "_FUNC_(timestamp) - Returns the number of nanoseconds since 1970-01-01 00:00:00 UTC.",
examples = """
Examples:
> SET spark.sql.timestampNanosTypes.enabled=true;
spark.sql.timestampNanosTypes.enabled true
> SELECT _FUNC_(TIMESTAMP_NTZ '2008-12-25 15:30:00.123456789');
1230219000123456789
""",
group = "datetime_funcs",
since = "4.3.0")
// scalastyle:on line.contains.tab
case class UnixNanos(child: Expression)
extends UnaryExpression with ExpectsInputTypes {
override def nullIntolerant: Boolean = true

// Accepts only the nanosecond-precision timestamp types TIMESTAMP_LTZ(p) / TIMESTAMP_NTZ(p)
// (p in [7, 9]); the microsecond timestamp types are intentionally not supported here.
override def inputTypes: Seq[AbstractDataType] = Seq(AnyTimestampNanoType)

// epochMicros * 1000 overflows a 64-bit BIGINT across the full [0001..9999] calendar range, so
// the result is a lossless DECIMAL with enough precision to hold every value (~2.5e20 max).
override def dataType: DataType = DecimalType(21, 0)

override def nullSafeEval(input: Any): Any = {
val v = input.asInstanceOf[TimestampNanosVal]
val nanos = BigInteger.valueOf(v.epochMicros)
.multiply(BigInteger.valueOf(NANOS_PER_MICROS))
.add(BigInteger.valueOf(v.nanosWithinMicro.toLong))
Decimal.apply(new java.math.BigDecimal(nanos), 21, 0)
}

override protected def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
nullSafeCodeGen(ctx, ev, c => {
val bi = ctx.freshName("nanos")
s"""
|java.math.BigInteger $bi = java.math.BigInteger.valueOf($c.epochMicros)
| .multiply(java.math.BigInteger.valueOf(${NANOS_PER_MICROS}L))
| .add(java.math.BigInteger.valueOf($c.nanosWithinMicro));
|${ev.value} = Decimal.apply(new java.math.BigDecimal($bi), 21, 0);
|""".stripMargin
})
}

override def prettyName: String = "unix_nanos"

override protected def withNewChildInternal(newChild: Expression): UnixNanos =
copy(child = newChild)
}

// scalastyle:off line.contains.tab
@ExpressionDescription(
usage = "_FUNC_(date) - Returns the year component of the date/timestamp.",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.internal.SQLConf.TimestampTypes
import org.apache.spark.sql.types._
import org.apache.spark.sql.types.DataTypeTestUtils.{dayTimeIntervalTypes, yearMonthIntervalTypes}
import org.apache.spark.unsafe.types.{CalendarInterval, UTF8String}
import org.apache.spark.unsafe.types.{CalendarInterval, TimestampNanosVal, UTF8String}

class DateExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper {

Expand Down Expand Up @@ -1696,6 +1696,43 @@ class DateExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper {
checkEvaluation(UnixMicros(Literal(timestampWithNanos)), 1000001L)
}

test("SPARK-57527: unix_nanos over nanosecond-precision timestamps") {
import org.apache.spark.sql.catalyst.util.TimestampNanosTestUtils._

def expectedNanos(v: TimestampNanosVal): Decimal = {
val nanos = BigInt(v.epochMicros) * NANOS_PER_MICROS + v.nanosWithinMicro.toInt
Decimal(BigDecimal(nanos), 21, 0)
}

// 2008-12-25 15:30:00.123456789 -> 1230219000123456789 nanos since the epoch. unix_nanos
// applies no zone shift, so the NTZ wall-clock value and the LTZ instant at the same UTC
// reading produce the same result; the declared precision does not re-truncate the value.
val ntz = localDateTimeToNanosVal(timestampNTZ(2008, 12, 25, 15, 30, 0, 123456789))
val ltz = instantToNanosVal(Instant.parse("2008-12-25T15:30:00.123456789Z"))
val post = Decimal(BigDecimal("1230219000123456789"), 21, 0)
foreachNanosPrecision { p =>
checkEvaluation(UnixNanos(Literal.create(ntz, TimestampNTZNanosType(p))), post)
checkEvaluation(UnixNanos(Literal.create(ltz, TimestampLTZNanosType(p))), post)
}

// Pre-epoch value exercises the negative-epoch path.
val preEpoch = localDateTimeToNanosVal(timestampNTZ(1960, 1, 1, 0, 0, 0, 1))
checkEvaluation(
UnixNanos(Literal.create(preEpoch, TimestampNTZNanosType(9))), expectedNanos(preEpoch))

// Far-future value: epochMicros * 1000 overflows a 64-bit BIGINT, so the DECIMAL result must
// exceed Long.MaxValue and the computation must not be done in long arithmetic.
val far = localDateTimeToNanosVal(timestampNTZ(9999, 12, 31, 23, 59, 59, 999999999))
checkEvaluation(UnixNanos(Literal.create(far, TimestampNTZNanosType(9))), expectedNanos(far))
val farResult =
UnixNanos(Literal.create(far, TimestampNTZNanosType(9))).eval().asInstanceOf[Decimal]
assert(farResult.toJavaBigDecimal.compareTo(java.math.BigDecimal.valueOf(Long.MaxValue)) > 0)

// NULL input.
checkEvaluation(UnixNanos(Literal.create(null, TimestampNTZNanosType(9))), null)
checkEvaluation(UnixNanos(Literal.create(null, TimestampLTZNanosType(9))), null)
}

test("TIMESTAMP_SECONDS") {
def testIntegralFunc(value: Number): Unit = {
checkEvaluation(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -435,6 +435,7 @@
| org.apache.spark.sql.catalyst.expressions.UnixDate | unix_date | SELECT unix_date(DATE("1970-01-02")) | struct<unix_date(1970-01-02):int> |
| org.apache.spark.sql.catalyst.expressions.UnixMicros | unix_micros | SELECT unix_micros(TIMESTAMP('1970-01-01 00:00:01Z')) | struct<unix_micros(1970-01-01 00:00:01Z):bigint> |
| org.apache.spark.sql.catalyst.expressions.UnixMillis | unix_millis | SELECT unix_millis(TIMESTAMP('1970-01-01 00:00:01Z')) | struct<unix_millis(1970-01-01 00:00:01Z):bigint> |
| org.apache.spark.sql.catalyst.expressions.UnixNanos | unix_nanos | SELECT unix_nanos(TIMESTAMP_NTZ '2008-12-25 15:30:00.123456789') | struct<unix_nanos(TIMESTAMP_NTZ '2008-12-25 15:30:00.123456789'):decimal(21,0)> |
| org.apache.spark.sql.catalyst.expressions.UnixSeconds | unix_seconds | SELECT unix_seconds(TIMESTAMP('1970-01-01 00:00:01Z')) | struct<unix_seconds(1970-01-01 00:00:01Z):bigint> |
| org.apache.spark.sql.catalyst.expressions.UnixTimestamp | unix_timestamp | SELECT unix_timestamp() | struct<unix_timestamp(current_timestamp(), yyyy-MM-dd HH:mm:ss):bigint> |
| org.apache.spark.sql.catalyst.expressions.Upper | ucase | SELECT ucase('SparkSql') | struct<ucase(SparkSql):string> |
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -696,3 +696,45 @@ SELECT unix_timestamp(NULL :: timestamp_ltz(9)), to_unix_timestamp(NULL :: times
-- !query analysis
Project [unix_timestamp(cast(null as timestamp_ltz(9)), yyyy-MM-dd HH:mm:ss, Some(America/Los_Angeles), true) AS unix_timestamp(CAST(NULL AS TIMESTAMP_LTZ(9)), yyyy-MM-dd HH:mm:ss)#xL, to_unix_timestamp(cast(null as timestamp_ltz(9)), yyyy-MM-dd HH:mm:ss, Some(America/Los_Angeles), true) AS to_unix_timestamp(CAST(NULL AS TIMESTAMP_LTZ(9)), yyyy-MM-dd HH:mm:ss)#xL]
+- OneRowRelation


-- !query
SELECT unix_nanos(TIMESTAMP_LTZ '2020-01-01 13:24:35.123456789 UTC')
-- !query analysis
Project [unix_nanos(2020-01-01 05:24:35.123456789) AS unix_nanos(TIMESTAMP_LTZ '2020-01-01 05:24:35.123456789')#x]
+- OneRowRelation


-- !query
SELECT unix_nanos('2020-01-01 13:24:35.123456789 UTC' :: timestamp_ltz(7))
-- !query analysis
Project [unix_nanos(cast(2020-01-01 13:24:35.123456789 UTC as timestamp_ltz(7))) AS unix_nanos(CAST(2020-01-01 13:24:35.123456789 UTC AS TIMESTAMP_LTZ(7)))#x]
+- OneRowRelation


-- !query
SELECT unix_nanos('2020-01-01 13:24:35.123456789 UTC' :: timestamp_ltz(8))
-- !query analysis
Project [unix_nanos(cast(2020-01-01 13:24:35.123456789 UTC as timestamp_ltz(8))) AS unix_nanos(CAST(2020-01-01 13:24:35.123456789 UTC AS TIMESTAMP_LTZ(8)))#x]
+- OneRowRelation


-- !query
SELECT unix_nanos(TIMESTAMP_LTZ '9999-12-31 23:59:59.999999999 UTC')
-- !query analysis
Project [unix_nanos(9999-12-31 15:59:59.999999999) AS unix_nanos(TIMESTAMP_LTZ '9999-12-31 15:59:59.999999999')#x]
+- OneRowRelation


-- !query
SELECT unix_nanos(TIMESTAMP_LTZ '1960-01-01 00:00:00.000000001 UTC')
-- !query analysis
Project [unix_nanos(1959-12-31 16:00:00.000000001) AS unix_nanos(TIMESTAMP_LTZ '1959-12-31 16:00:00.000000001')#x]
+- OneRowRelation


-- !query
SELECT unix_nanos(NULL :: timestamp_ltz(9))
-- !query analysis
Project [unix_nanos(cast(null as timestamp_ltz(9))) AS unix_nanos(CAST(NULL AS TIMESTAMP_LTZ(9)))#x]
+- OneRowRelation
Original file line number Diff line number Diff line change
Expand Up @@ -616,3 +616,45 @@ SELECT unix_timestamp(NULL :: timestamp_ntz(9)), to_unix_timestamp(NULL :: times
-- !query analysis
Project [unix_timestamp(cast(null as timestamp_ntz(9)), yyyy-MM-dd HH:mm:ss, Some(America/Los_Angeles), true) AS unix_timestamp(CAST(NULL AS TIMESTAMP_NTZ(9)), yyyy-MM-dd HH:mm:ss)#xL, to_unix_timestamp(cast(null as timestamp_ntz(9)), yyyy-MM-dd HH:mm:ss, Some(America/Los_Angeles), true) AS to_unix_timestamp(CAST(NULL AS TIMESTAMP_NTZ(9)), yyyy-MM-dd HH:mm:ss)#xL]
+- OneRowRelation


-- !query
SELECT unix_nanos(TIMESTAMP_NTZ '2020-01-01 13:24:35.123456789')
-- !query analysis
Project [unix_nanos(2020-01-01 13:24:35.123456789) AS unix_nanos(TIMESTAMP_NTZ '2020-01-01 13:24:35.123456789')#x]
+- OneRowRelation


-- !query
SELECT unix_nanos('2020-01-01 13:24:35.123456789' :: timestamp_ntz(7))
-- !query analysis
Project [unix_nanos(cast(2020-01-01 13:24:35.123456789 as timestamp_ntz(7))) AS unix_nanos(CAST(2020-01-01 13:24:35.123456789 AS TIMESTAMP_NTZ(7)))#x]
+- OneRowRelation


-- !query
SELECT unix_nanos('2020-01-01 13:24:35.123456789' :: timestamp_ntz(8))
-- !query analysis
Project [unix_nanos(cast(2020-01-01 13:24:35.123456789 as timestamp_ntz(8))) AS unix_nanos(CAST(2020-01-01 13:24:35.123456789 AS TIMESTAMP_NTZ(8)))#x]
+- OneRowRelation


-- !query
SELECT unix_nanos(TIMESTAMP_NTZ '9999-12-31 23:59:59.999999999')
-- !query analysis
Project [unix_nanos(9999-12-31 23:59:59.999999999) AS unix_nanos(TIMESTAMP_NTZ '9999-12-31 23:59:59.999999999')#x]
+- OneRowRelation


-- !query
SELECT unix_nanos(TIMESTAMP_NTZ '1960-01-01 00:00:00.000000001')
-- !query analysis
Project [unix_nanos(1960-01-01 00:00:00.000000001) AS unix_nanos(TIMESTAMP_NTZ '1960-01-01 00:00:00.000000001')#x]
+- OneRowRelation


-- !query
SELECT unix_nanos(NULL :: timestamp_ntz(9))
-- !query analysis
Project [unix_nanos(cast(null as timestamp_ntz(9))) AS unix_nanos(CAST(NULL AS TIMESTAMP_NTZ(9)))#x]
+- OneRowRelation
Original file line number Diff line number Diff line change
Expand Up @@ -187,3 +187,16 @@ SELECT unix_timestamp('2020-01-01 13:24:35.999999999' :: timestamp_ltz(7));
SELECT unix_timestamp(TIMESTAMP_LTZ '1969-12-31 23:59:59.500000000 UTC');
-- NULL nanosecond timestamp.
SELECT unix_timestamp(NULL :: timestamp_ltz(9)), to_unix_timestamp(NULL :: timestamp_ltz(9));

-- SPARK-57527: unix_nanos over nanosecond-precision values returns DECIMAL(21, 0) nanoseconds since
-- the epoch. The explicit-zone literals below fix the instant directly, independent of the session
-- time zone. The sub-microsecond digits are kept, truncated to the type's precision.
SELECT unix_nanos(TIMESTAMP_LTZ '2020-01-01 13:24:35.123456789 UTC');
SELECT unix_nanos('2020-01-01 13:24:35.123456789 UTC' :: timestamp_ltz(7));
SELECT unix_nanos('2020-01-01 13:24:35.123456789 UTC' :: timestamp_ltz(8));
-- Far-future value: epochMicros * 1000 overflows a 64-bit BIGINT, exercising the DECIMAL path.
SELECT unix_nanos(TIMESTAMP_LTZ '9999-12-31 23:59:59.999999999 UTC');
-- Pre-epoch value exercises the negative-epoch path.
SELECT unix_nanos(TIMESTAMP_LTZ '1960-01-01 00:00:00.000000001 UTC');
-- NULL nanosecond timestamp.
SELECT unix_nanos(NULL :: timestamp_ltz(9));
Original file line number Diff line number Diff line change
Expand Up @@ -161,3 +161,16 @@ SELECT to_unix_timestamp('2020-01-01 13:24:35.000000001' :: timestamp_ntz(9));
SELECT unix_timestamp(TIMESTAMP_NTZ '1969-12-31 23:59:59.500000000');
-- NULL nanosecond timestamp.
SELECT unix_timestamp(NULL :: timestamp_ntz(9)), to_unix_timestamp(NULL :: timestamp_ntz(9));

-- SPARK-57527: unix_nanos over nanosecond-precision values returns DECIMAL(21, 0) nanoseconds since
-- the epoch; NTZ applies no zone shift, so the wall-clock value is read as the epoch instant. The
-- sub-microsecond digits are kept, truncated to the type's precision.
SELECT unix_nanos(TIMESTAMP_NTZ '2020-01-01 13:24:35.123456789');
SELECT unix_nanos('2020-01-01 13:24:35.123456789' :: timestamp_ntz(7));
SELECT unix_nanos('2020-01-01 13:24:35.123456789' :: timestamp_ntz(8));
-- Far-future value: epochMicros * 1000 overflows a 64-bit BIGINT, exercising the DECIMAL path.
SELECT unix_nanos(TIMESTAMP_NTZ '9999-12-31 23:59:59.999999999');
-- Pre-epoch value exercises the negative-epoch path.
SELECT unix_nanos(TIMESTAMP_NTZ '1960-01-01 00:00:00.000000001');
-- NULL nanosecond timestamp.
SELECT unix_nanos(NULL :: timestamp_ntz(9));
Original file line number Diff line number Diff line change
Expand Up @@ -782,3 +782,51 @@ SELECT unix_timestamp(NULL :: timestamp_ltz(9)), to_unix_timestamp(NULL :: times
struct<unix_timestamp(CAST(NULL AS TIMESTAMP_LTZ(9)), yyyy-MM-dd HH:mm:ss):bigint,to_unix_timestamp(CAST(NULL AS TIMESTAMP_LTZ(9)), yyyy-MM-dd HH:mm:ss):bigint>
-- !query output
NULL NULL


-- !query
SELECT unix_nanos(TIMESTAMP_LTZ '2020-01-01 13:24:35.123456789 UTC')
-- !query schema
struct<unix_nanos(TIMESTAMP_LTZ '2020-01-01 05:24:35.123456789'):decimal(21,0)>
-- !query output
1577885075123456789


-- !query
SELECT unix_nanos('2020-01-01 13:24:35.123456789 UTC' :: timestamp_ltz(7))
-- !query schema
struct<unix_nanos(CAST(2020-01-01 13:24:35.123456789 UTC AS TIMESTAMP_LTZ(7))):decimal(21,0)>
-- !query output
1577885075123456700


-- !query
SELECT unix_nanos('2020-01-01 13:24:35.123456789 UTC' :: timestamp_ltz(8))
-- !query schema
struct<unix_nanos(CAST(2020-01-01 13:24:35.123456789 UTC AS TIMESTAMP_LTZ(8))):decimal(21,0)>
-- !query output
1577885075123456780


-- !query
SELECT unix_nanos(TIMESTAMP_LTZ '9999-12-31 23:59:59.999999999 UTC')
-- !query schema
struct<unix_nanos(TIMESTAMP_LTZ '9999-12-31 15:59:59.999999999'):decimal(21,0)>
-- !query output
253402300799999999999


-- !query
SELECT unix_nanos(TIMESTAMP_LTZ '1960-01-01 00:00:00.000000001 UTC')
-- !query schema
struct<unix_nanos(TIMESTAMP_LTZ '1959-12-31 16:00:00.000000001'):decimal(21,0)>
-- !query output
-315619199999999999


-- !query
SELECT unix_nanos(NULL :: timestamp_ltz(9))
-- !query schema
struct<unix_nanos(CAST(NULL AS TIMESTAMP_LTZ(9))):decimal(21,0)>
-- !query output
NULL
Original file line number Diff line number Diff line change
Expand Up @@ -692,3 +692,51 @@ SELECT unix_timestamp(NULL :: timestamp_ntz(9)), to_unix_timestamp(NULL :: times
struct<unix_timestamp(CAST(NULL AS TIMESTAMP_NTZ(9)), yyyy-MM-dd HH:mm:ss):bigint,to_unix_timestamp(CAST(NULL AS TIMESTAMP_NTZ(9)), yyyy-MM-dd HH:mm:ss):bigint>
-- !query output
NULL NULL


-- !query
SELECT unix_nanos(TIMESTAMP_NTZ '2020-01-01 13:24:35.123456789')
-- !query schema
struct<unix_nanos(TIMESTAMP_NTZ '2020-01-01 13:24:35.123456789'):decimal(21,0)>
-- !query output
1577885075123456789


-- !query
SELECT unix_nanos('2020-01-01 13:24:35.123456789' :: timestamp_ntz(7))
-- !query schema
struct<unix_nanos(CAST(2020-01-01 13:24:35.123456789 AS TIMESTAMP_NTZ(7))):decimal(21,0)>
-- !query output
1577885075123456700


-- !query
SELECT unix_nanos('2020-01-01 13:24:35.123456789' :: timestamp_ntz(8))
-- !query schema
struct<unix_nanos(CAST(2020-01-01 13:24:35.123456789 AS TIMESTAMP_NTZ(8))):decimal(21,0)>
-- !query output
1577885075123456780


-- !query
SELECT unix_nanos(TIMESTAMP_NTZ '9999-12-31 23:59:59.999999999')
-- !query schema
struct<unix_nanos(TIMESTAMP_NTZ '9999-12-31 23:59:59.999999999'):decimal(21,0)>
-- !query output
253402300799999999999


-- !query
SELECT unix_nanos(TIMESTAMP_NTZ '1960-01-01 00:00:00.000000001')
-- !query schema
struct<unix_nanos(TIMESTAMP_NTZ '1960-01-01 00:00:00.000000001'):decimal(21,0)>
-- !query output
-315619199999999999


-- !query
SELECT unix_nanos(NULL :: timestamp_ntz(9))
-- !query schema
struct<unix_nanos(CAST(NULL AS TIMESTAMP_NTZ(9))):decimal(21,0)>
-- !query output
NULL
Loading