createDataFrame getting message java.lang.String cannot be cast to java.sql.Date

I am trying to merge the header into the data and write it out as a single csv file (following the approach cited from @Kang):

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, FileUtil, Path}
import org.apache.spark.sql.{Row, SparkSession}
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types.{StructField, StringType, StructType}

object ListOfSavingFiltered {
  def merge(srcPath: String, dstPath: String): Unit = {
    val hadoopConfig = new Configuration()
    val hdfs = FileSystem.get(hadoopConfig)
    FileUtil.copyMerge(hdfs, new Path(srcPath), hdfs, new Path(dstPath), false, hadoopConfig, null)
    // the"true" setting deletes the source files once they are merged into the new output
  }

  def main(args: Array[String]): Unit = {

    val url ="jdbc:sqlserver://localhost;databaseName=InsightWarehouse;integratedSecurity=true";
    val driver ="com.microsoft.sqlserver.jdbc.SQLServerDriver"

    val v_Account ="dbo.v_Account"
    val v_Customer ="dbo.v_Customer"

    val spark = SparkSession
      .builder.master("local[*]")
      //.config("spark.debug.maxToStringFields", "100")
      .appName("Insight Application Big Data")
      .getOrCreate()


    val dfAccount = spark
      .read
      .format("jdbc")
      .option("url", url)
      .option("driver", driver)
      .option("dbtable", v_Account)
      .load()

    val dfCustomer = spark
      .read
      .format("jdbc")
      .option("url", url)
      .option("driver", driver)
      .option("dbtable", v_Customer)
      .load()

    val Classification = Seq("Contractual Account", "Non-Term Deposit", "Term Deposit")

    //dfAccount.printSchema()
    val joined = dfAccount.as("a")
      .join(dfCustomer.as("c"),
        Seq("BusinessDate", "CustomerID"), "LEFT")
      .filter(
        dfAccount.col("BusinessDate") === "2018-11-28"
          && dfAccount.col("Category") === "Deposit"
          // && dfAccount.col("IsActive").equalTo("Yes")
          && dfAccount.col("Classification").isin(Classification: _*)
      )

    //joined.show()
    val columnNames = Seq[String](
     "a.AcctBranchName",
     "c.CustomerNum",
     "c.SourceCustomerId",
     "a.SourceAccountId",
     "a.AccountNum",
     "c.FullName",
     "c.LastName",
     "c.BirthDate",
     "a.Balance",
     "a.InterestAccrued",
     "a.InterestRate",
     "a.SpreadRate",
     "a.Classification",
     "a.ProductType",
     "a.ProductDesc",
     "a.StartDate",
     "a.MaturityDate",
     "a.ClosedDate",
     "a.FixOrVar",
     "a.Term",
     "a.TermUnit",
     "a.MonthlyNetIncome",
     "a.Status_",
     "a.HoldsTotal",
     "a.AvailableFunds",
     "a.InterestRateIndex",
     "a.InterestRateVariance",
     "a.FeePlan",
     "c.CustEmplFullName",
     "a.IsActive",
     "c.Residence",
     "c.Village",
     "c.Province",
     "c.Commune",
     "c.District",
     "a.Currency",
     "c.TaxType",
     "c.TaxRate",
     "RollOverStatus"
    )

    val outputfile = "src/main/resources/out/"
    val filename = "lifOfSaving.csv.gz"
    val outputFileName = outputfile + "/temp_" + filename
    val mergedFileName = outputfile + "/merged_" + filename
    val mergeFindGlob = outputFileName

    val responseWithSelectedColumns = joined.select(columnNames.map(c => col(c)): _*)
      .withColumn("RollOverStatus", when(col("RollOverStatus").equalTo("Y"), "Yes").otherwise("No"))


    //create a new data frame containing only header names
    import scala.collection.JavaConverters._
    val headerDF = spark.createDataFrame(List(Row.fromSeq(responseWithSelectedColumns.columns.toSeq)).asJava, responseWithSelectedColumns.schema)


    //merge header names with data
    headerDF.union(responseWithSelectedColumns)
      // .coalesce(1) // so that just a single part-file is created
      .repartition(4)
      .write.mode("overwrite")
      .option("codec", "org.apache.hadoop.io.compress.GzipCodec")
      .format("com.databricks.spark.csv")
      .option("charset", "UTF8")
      .option("mapreduce.fileoutputcommitter.marksuccessfuljobs", "false") // avoid creating .crc files
      .option("header", "false") // the header row is already prepended via the union above
      .save(outputFileName)
    merge(mergeFindGlob, mergedFileName)
    responseWithSelectedColumns.unpersist()

    spark.stop()
  }
}

The code looks correct to me, but I still get the following error message:

Exception in thread "main" java.lang.ClassCastException: java.lang.String cannot be cast to java.sql.Date
    at org.apache.spark.sql.catalyst.CatalystTypeConverters$DateConverter$.toCatalystImpl(CatalystTypeConverters.scala:300)

Can anyone please help?


You don't need to make the header DataFrame match the data schema. The cast error comes from reusing responseWithSelectedColumns.schema for the header row: that schema contains typed columns such as DateType, but the header values are plain strings, so Catalyst cannot convert them.

For example:

import org.apache.spark.sql.{SparkSession, functions => sqlfunctions}

val spark =
  SparkSession
  .builder
  .master("local[*]")
  .getOrCreate()
import spark.implicits._

val dataDF =
  List(
    (1, "Luis"),
    (2, "kn3l")
  ).toDF("id", "name").withColumn("date", sqlfunctions.current_date())

val headersDF =
  List(
    ("id", "name", "date")
  ).toDF("id", "name", "date")

val union = headersDF.unionByName(dataDF)
// union: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [id: string, name: string, date: string]

union.printSchema()
// root
// |-- id: string (nullable = true)
// |-- name: string (nullable = true)
// |-- date: string (nullable = true)

union.show()
// +---+----+----------+
// | id|name|      date|
// +---+----+----------+
// | id|name|      date|
// |  1|Luis|2018-12-05|
// |  2|kn3l|2018-12-05|
// +---+----+----------+
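
Applied to the code in the question, the same idea means not reusing responseWithSelectedColumns.schema for the header row. One way is to cast the data columns to string and give the header its own all-string schema; the sketch below is untested against the original tables and assumes the spark session and responseWithSelectedColumns from the question are in scope:

import scala.collection.JavaConverters._
import org.apache.spark.sql.Row
import org.apache.spark.sql.functions.col
import org.apache.spark.sql.types.{StringType, StructField, StructType}

val data = responseWithSelectedColumns

// Cast every data column to string so the union never has to coerce the
// header strings into DateType/DecimalType columns.
val dataAsStrings = data.select(data.columns.map(c => col(c).cast("string")): _*)

// Header DataFrame built with its own all-string schema, not the data schema.
val headerSchema = StructType(data.columns.map(c => StructField(c, StringType, nullable = true)))
val headerDF = spark.createDataFrame(List(Row.fromSeq(data.columns.toSeq)).asJava, headerSchema)

// Header row first, then the data; the rest of the write/merge logic can stay
// the same, writing withHeader instead of the original union.
val withHeader = headerDF.unionByName(dataAsStrings)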