createDataFrame getting message java.lang.String cannot be cast to java.sql.Date
我正在尝试将标头合并为单个文件输出为csv(@Kang引用)
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 | import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, FileUtil, Path} import org.apache.spark.sql.{Row, SparkSession} import org.apache.spark.sql.functions._ import org.apache.spark.sql.Row import org.apache.spark.sql.types.{StructField, StringType, StructType} object ListOfSavingFiltered { def merge(srcPath: String, dstPath: String): Unit = { val hadoopConfig = new Configuration() val hdfs = FileSystem.get(hadoopConfig) FileUtil.copyMerge(hdfs, new Path(srcPath), hdfs, new Path(dstPath), false, hadoopConfig, null) // the"true" setting deletes the source files once they are merged into the new output } def main(args: Array[String]): Unit = { val url ="jdbc:sqlserver://localhost;databaseName=InsightWarehouse;integratedSecurity=true"; val driver ="com.microsoft.sqlserver.jdbc.SQLServerDriver" val v_Account ="dbo.v_Account" val v_Customer ="dbo.v_Customer" val spark = SparkSession. builder.master("local[*]") //.config("spark.debug.maxToStringFields","100") .appName("Insight Application Big Data") .getOrCreate() val dfAccount = spark .read .format("jdbc") .option("url", url) .option("driver", driver) .option("dbtable", v_Account) .load() val dfCustomer = spark .read .format("jdbc") .option("url", url) .option("driver", driver) .option("dbtable", v_Customer) .load() val Classification = Seq("Contractual Account","Non-Term Deposit","Term Deposit") //dfAccount.printSchema() val joined = dfAccount.as("a") .join(dfCustomer.as("c"), Seq("BusinessDate","CustomerID"),"LEFT") .filter( dfAccount.col("BusinessDate") ==="2018-11-28" && dfAccount.col("Category") ==="Deposit" // && dfAccount.col("IsActive").equalTo("Yes") && dfAccount.col("Classification").isin(Classification: _*) ) //joined.show() val columnNames = Seq[String]( "a.AcctBranchName", "c.CustomerNum", "c.SourceCustomerId", "a.SourceAccountId", "a.AccountNum", "c.FullName", "c.LastName", "c.BirthDate", "a.Balance", "a.InterestAccrued", "a.InterestRate", "a.SpreadRate", "a.Classification", "a.ProductType", "a.ProductDesc", "a.StartDate", "a.MaturityDate", "a.ClosedDate", "a.FixOrVar", "a.Term", "a.TermUnit", "a.MonthlyNetIncome", "a.Status_", "a.HoldsTotal", "a.AvailableFunds", "a.InterestRateIndex", "a.InterestRateVariance", "a.FeePlan", "c.CustEmplFullName", "a.IsActive", "c.Residence", "c.Village", "c.Province", "c.Commune", "c.District", "a.Currency", "c.TaxType", "c.TaxRate", "RollOverStatus" ) val outputfile ="src/main/resources/out/" var filename ="lifOfSaving.csv.gz" var outputFileName = outputfile +"/temp_" + filename var mergedFileName = outputfile +"/merged_" + filename var mergeFindGlob = outputFileName val responseWithSelectedColumns = joined.select(columnNames.map(c => col(c)): _*) .withColumn("RollOverStatus", when(col("RollOverStatus").equalTo("Y"),"Yes").otherwise("No")) //create a new data frame containing only header names import scala.collection.JavaConverters._ val headerDF = spark.createDataFrame(List(Row.fromSeq(responseWithSelectedColumns.columns.toSeq)).asJava, responseWithSelectedColumns.schema) //merge header names with data headerDF.union(responseWithSelectedColumns) // .coalesce(1) //So just a single part- file will be created .repartition(4) .write.mode("overwrite") .option("codec","org.apache.hadoop.io.compress.GzipCodec") .format("com.databricks.spark.csv") .option("charset","UTF8") .option("mapreduce.fileoutputcommitter.marksuccessfuljobs","false") //Avoid creating of crc files .option("header","false") //Write the header .save(outputFileName) merge(mergeFindGlob, mergedFileName) responseWithSelectedColumns.unpersist() spark.stop() } } |
该代码似乎正确,但仍会收到以下错误消息:
1 2 | Exception in thread"main" java.lang.ClassCastException: java.lang.String cannot be cast to java.sql.Date at org.apache.spark.sql.catalyst.CatalystTypeConverters$DateConverter$.toCatalystImpl(CatalystTypeConverters.scala:300) |
有人请帮忙吗?
您不需要使标头
例如。
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 | import org.apache.spark.sql.{SparkSession, functions => sqlfunctions} val spark = SparkSession .builder .master("local[*]") .getOrCreate() import spark.implicits._ val dataDF = List( (1,"Luis"), (2,"kn3l") ).toDF("id","name").withColumn("date", sqlfunctions.current_date()) val headersDF = List( ("id","name","date") ).toDF("id","name","date") val union = headersDF.unionByName(dataDF) // union: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [id: string, name: string, date: string] union.printSchema() // root // |-- id: string (nullable = true) // |-- name: string (nullable = true) // |-- date: string (nullable = true) union.show() // +---+----+----------+ // | id|name| date| // +---+----+----------+ // | id|name| date| // | 1|Luis|2018-12-05| // | 2|kn3l|2018-12-05| // +---+----+----------+ |