createDataFrame getting message java.lang.String cannot be cast to java.sql.Date

I am trying to merge the header into the data and write it out as a single csv file (following the approach cited from @Kang):

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, FileUtil, Path}
import org.apache.spark.sql.{Row, SparkSession}
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types.{StructField, StringType, StructType}

object ListOfSavingFiltered {
  def merge(srcPath: String, dstPath: String): Unit = {
    val hadoopConfig = new Configuration()
    val hdfs = FileSystem.get(hadoopConfig)
    FileUtil.copyMerge(hdfs, new Path(srcPath), hdfs, new Path(dstPath), false, hadoopConfig, null)
    // the"true" setting deletes the source files once they are merged into the new output
  }

  def main(args: Array[String]): Unit = {

    val url ="jdbc:sqlserver://localhost;databaseName=InsightWarehouse;integratedSecurity=true";
    val driver ="com.microsoft.sqlserver.jdbc.SQLServerDriver"

    val v_Account ="dbo.v_Account"
    val v_Customer ="dbo.v_Customer"

    val spark = SparkSession
      .builder.master("local[*]")
      //.config("spark.debug.maxToStringFields", "100")
      .appName("Insight Application Big Data")
      .getOrCreate()


    val dfAccount = spark
      .read
      .format("jdbc")
      .option("url", url)
      .option("driver", driver)
      .option("dbtable", v_Account)
      .load()

    val dfCustomer = spark
      .read
      .format("jdbc")
      .option("url", url)
      .option("driver", driver)
      .option("dbtable", v_Customer)
      .load()

    val Classification = Seq("Contractual Account", "Non-Term Deposit", "Term Deposit")

    //dfAccount.printSchema()
    val joined = dfAccount.as("a")
      .join(dfCustomer.as("c"),
        Seq("BusinessDate", "CustomerID"), "LEFT")
      .filter(
        dfAccount.col("BusinessDate") === "2018-11-28"
          && dfAccount.col("Category") === "Deposit"
          // && dfAccount.col("IsActive").equalTo("Yes")
          && dfAccount.col("Classification").isin(Classification: _*)
      )

    //joined.show()
    val columnNames = Seq[String](
     "a.AcctBranchName",
     "c.CustomerNum",
     "c.SourceCustomerId",
     "a.SourceAccountId",
     "a.AccountNum",
     "c.FullName",
     "c.LastName",
     "c.BirthDate",
     "a.Balance",
     "a.InterestAccrued",
     "a.InterestRate",
     "a.SpreadRate",
     "a.Classification",
     "a.ProductType",
     "a.ProductDesc",
     "a.StartDate",
     "a.MaturityDate",
     "a.ClosedDate",
     "a.FixOrVar",
     "a.Term",
     "a.TermUnit",
     "a.MonthlyNetIncome",
     "a.Status_",
     "a.HoldsTotal",
     "a.AvailableFunds",
     "a.InterestRateIndex",
     "a.InterestRateVariance",
     "a.FeePlan",
     "c.CustEmplFullName",
     "a.IsActive",
     "c.Residence",
     "c.Village",
     "c.Province",
     "c.Commune",
     "c.District",
     "a.Currency",
     "c.TaxType",
     "c.TaxRate",
     "RollOverStatus"
    )

    val outputfile = "src/main/resources/out/"
    val filename = "lifOfSaving.csv.gz"
    val outputFileName = outputfile + "/temp_" + filename
    val mergedFileName = outputfile + "/merged_" + filename
    val mergeFindGlob = outputFileName

    val responseWithSelectedColumns = joined.select(columnNames.map(c => col(c)): _*)
      .withColumn("RollOverStatus", when(col("RollOverStatus").equalTo("Y"), "Yes").otherwise("No"))


    //create a new data frame containing only header names
    import scala.collection.JavaConverters._
    val headerDF = spark.createDataFrame(List(Row.fromSeq(responseWithSelectedColumns.columns.toSeq)).asJava, responseWithSelectedColumns.schema)


    //merge header names with data
    headerDF.union(responseWithSelectedColumns)
      // .coalesce(1) // so that just a single part-file is created
      .repartition(4)
      .write.mode("overwrite")
      .option("codec", "org.apache.hadoop.io.compress.GzipCodec")
      .format("com.databricks.spark.csv")
      .option("charset", "UTF8")
      .option("mapreduce.fileoutputcommitter.marksuccessfuljobs", "false") // avoid creating .crc files
      .option("header", "false") // the header row is already prepended via the union above
      .save(outputFileName)
    merge(mergeFindGlob, mergedFileName)
    responseWithSelectedColumns.unpersist()

    spark.stop()
  }
}

The code looks correct to me, but I still get the following error message:

Exception in thread "main" java.lang.ClassCastException: java.lang.String cannot be cast to java.sql.Date
    at org.apache.spark.sql.catalyst.CatalystTypeConverters$DateConverter$.toCatalystImpl(CatalystTypeConverters.scala:300)

Can anyone please help?


You don't need to make the header DataFrame match the data schema. The cast error comes from reusing responseWithSelectedColumns.schema for the header row: that schema contains typed columns such as DateType, but the header values are plain strings, so Catalyst cannot convert them.

For example:

import org.apache.spark.sql.{SparkSession, functions => sqlfunctions}

val spark =
  SparkSession
  .builder
  .master("local[*]")
  .getOrCreate()
import spark.implicits._

val dataDF =
  List(
    (1, "Luis"),
    (2, "kn3l")
  ).toDF("id", "name").withColumn("date", sqlfunctions.current_date())

val headersDF =
  List(
    ("id", "name", "date")
  ).toDF("id", "name", "date")

val union = headersDF.unionByName(dataDF)
// union: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [id: string, name: string, date: string]

union.printSchema()
// root
// |-- id: string (nullable = true)
// |-- name: string (nullable = true)
// |-- date: string (nullable = true)

union.show()
// +---+----+----------+
// | id|name|      date|
// +---+----+----------+
// | id|name|      date|
// |  1|Luis|2018-12-05|
// |  2|kn3l|2018-12-05|
// +---+----+----------+
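
Applied to the code in the question, the same idea means not reusing responseWithSelectedColumns.schema for the header row. One way is to cast the data columns to string and give the header its own all-string schema; the sketch below is untested against the original tables and assumes the spark session and responseWithSelectedColumns from the question are in scope:

import scala.collection.JavaConverters._
import org.apache.spark.sql.Row
import org.apache.spark.sql.functions.col
import org.apache.spark.sql.types.{StringType, StructField, StructType}

val data = responseWithSelectedColumns

// Cast every data column to string so the union never has to coerce the
// header strings into DateType/DecimalType columns.
val dataAsStrings = data.select(data.columns.map(c => col(c).cast("string")): _*)

// Header DataFrame built with its own all-string schema, not the data schema.
val headerSchema = StructType(data.columns.map(c => StructField(c, StringType, nullable = true)))
val headerDF = spark.createDataFrame(List(Row.fromSeq(data.columns.toSeq)).asJava, headerSchema)

// Header row first, then the data; the rest of the write/merge logic can stay
// the same, writing withHeader instead of the original union.
val withHeader = headerDF.unionByName(dataAsStrings)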