
Spark: conflicting partition schema in Parquet files

Hi all,
unfortunately I'm stuck on a hard problem with Spark and Scala.

I would like to run my first query in Spark, using Scala and the DataFrame API. I'm working on the Cloudera QuickStart VM, with the default database.

Here is my Scala code, followed by the POM.xml configuration:

package org.test.spark

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.hive._

object SimpleAppDataFrame {

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
      .setAppName("SimpleAppDataFrame")
      .setMaster("local")

    val sc = new SparkContext(conf)
    val sqlContext = new SQLContext(sc)

    //val hc = new HiveContext(sc)
    //import hc.implicits._

    // Read a single Parquet file out of the Hive warehouse directory
    val products = sqlContext.read.parquet("hdfs://quickstart.cloudera:8888/user/hive/warehouse/products/bc223562-ee45-42a6-b9a0-05635efb3e59.parquet")

    // Earlier attempts, left commented out:
    //val parquetFile = sqlContext.read.format("org.apache.spark.sql.parquet").load("hdfs://quickstart.cloudera:8888/user/hive/warehouse/products/bc223562-ee45-42a6-b9a0-05635efb3e59.parquet")
    //parquetFile.registerTempTable("products")
    //val result = hc.sql("select * from products")
    //parquetFile.show()
  }
}
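
For reference, here is a variant I still want to try (reusing the sqlContext from the code above): reading the whole table directory instead of a single file, and going through the NameNode's default RPC port 8020; as far as I know, on the QuickStart VM, 8888 is normally Hue's web UI, not HDFS. This is only a sketch under those assumptions:

// Sketch only: read the table directory rather than one file, through
// port 8020 (assumption: 8020 is the NameNode RPC port, 8888 is Hue).
val productsDir = sqlContext.read
  .parquet("hdfs://quickstart.cloudera:8020/user/hive/warehouse/products")
productsDir.registerTempTable("products")
sqlContext.sql("select * from products").show()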

The POM.xml file:

<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
	xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
	<modelVersion>4.0.0</modelVersion>
	<groupId>org.test</groupId>
	<artifactId>spark</artifactId>
	<version>0.0.1-SNAPSHOT</version>
	<dependencies>
		<dependency>
			<groupId>org.apache.spark</groupId>
			<artifactId>spark-mllib_2.10</artifactId>
			<version>1.6.0</version>
		</dependency>
		<!-- https://mvnrepository.com/artifact/com.twitter/parquet-tools -->
		<dependency>
			<groupId>com.twitter</groupId>
			<artifactId>parquet-tools</artifactId>
			<version>1.6.0</version>
		</dependency>
		<!-- https://mvnrepository.com/artifact/com.twitter/parquet-format -->
		<dependency>
			<groupId>com.twitter</groupId>
			<artifactId>parquet-format</artifactId>
			<version>1.0.0</version>
		</dependency>
		<!-- https://mvnrepository.com/artifact/com.twitter/parquet-avro -->
		<dependency>
			<groupId>com.twitter</groupId>
			<artifactId>parquet-avro</artifactId>
			<version>1.5.0</version>
		</dependency>
		<!-- https://mvnrepository.com/artifact/org.apache.parquet/parquet-hadoop -->
		<dependency>
			<groupId>org.apache.parquet</groupId>
			<artifactId>parquet-hadoop</artifactId>
			<version>1.8.1</version>
		</dependency>

		<!-- https://mvnrepository.com/artifact/org.apache.parquet/parquet-avro -->
		<dependency>
			<groupId>org.apache.parquet</groupId>
			<artifactId>parquet-avro</artifactId>
			<version>1.8.1</version>
		</dependency>
		<dependency>
			<groupId>com.databricks</groupId>
			<artifactId>spark-avro_2.10</artifactId>
			<version>3.0.0</version>
		</dependency>
		<dependency> <!-- Spark SQL -->
			<groupId>org.apache.spark</groupId>
			<artifactId>spark-sql_2.10</artifactId>
			<version>1.6.0</version>
		</dependency>
		<!-- http://mvnrepository.com/artifact/com.databricks/spark-avro_2.10 -->
		<dependency>
			<groupId>com.databricks</groupId>
			<artifactId>spark-avro_2.10</artifactId>
			<version>2.0.1</version>
		</dependency>
		<dependency>
			<groupId>org.apache.hadoop</groupId>
			<artifactId>hadoop-client</artifactId>
			<version>2.4.0</version>
		</dependency>
		<!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-common -->
		<dependency>
			<groupId>org.apache.hadoop</groupId>
			<artifactId>hadoop-common</artifactId>
			<version>2.6.0</version>
			<scope>provided</scope>
			<exclusions>
				<exclusion>
					<groupId>javax.servlet</groupId>
					<artifactId>*</artifactId>
				</exclusion>
			</exclusions>
		</dependency>


		<dependency>
			<groupId>org.apache.spark</groupId>
			<artifactId>spark-core_2.10</artifactId>
			<version>1.5.2</version>
		</dependency>
		<dependency>
			<groupId>org.apache.spark</groupId>
			<artifactId>spark-sql_2.10</artifactId>
			<version>1.5.2</version>
		</dependency>
		<dependency>
			<groupId>org.apache.spark</groupId>
			<artifactId>spark-hive_2.10</artifactId>
			<version>1.5.2</version>
		</dependency>

		<!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-hdfs -->
		<dependency>
			<groupId>org.apache.hadoop</groupId>
			<artifactId>hadoop-hdfs</artifactId>
			<version>2.6.1</version>
		</dependency>
	</dependencies>
	<build>

		<pluginManagement>
			<plugins>
				<plugin>
					<artifactId>maven-resources-plugin</artifactId>
					<version>2.7</version>
				</plugin>
				<!-- mixed scala/java compile -->
				<plugin>
					<groupId>org.scala-tools</groupId>
					<artifactId>maven-scala-plugin</artifactId>
					<executions>
						<execution>
							<id>compile</id>
							<goals>
								<goal>compile</goal>
							</goals>
							<phase>compile</phase>
						</execution>
						<execution>
							<id>test-compile</id>
							<goals>
								<goal>testCompile</goal>
							</goals>
							<phase>test-compile</phase>
						</execution>
						<execution>
							<phase>process-resources</phase>
							<goals>
								<goal>compile</goal>
							</goals>
						</execution>
					</executions>
				</plugin>
			</plugins>
		</pluginManagement>
	</build>
</project>

The error that occurs is the following:

Exception in thread "main" java.lang.AssertionError: assertion failed: No predefined schema found, and no Parquet data files or summary files found under hdfs://quickstart.cloudera:8888/user/hive/warehouse/products/bc223562-ee45-42a6-b9a0-05635efb3e59.parquet.
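
Since the message says "No predefined schema found", one workaround I am considering is to pass an explicit schema to the reader. This is only a sketch: the field names and types below are placeholders, not the real layout of the products table.

import org.apache.spark.sql.types._

// Hypothetical schema: field names and types are placeholders only.
val productSchema = StructType(Seq(
  StructField("product_id", IntegerType, nullable = true),
  StructField("product_name", StringType, nullable = true),
  StructField("product_price", DoubleType, nullable = true)))

val productsWithSchema = sqlContext.read
  .schema(productSchema)
  .parquet("hdfs://quickstart.cloudera:8888/user/hive/warehouse/products/bc223562-ee45-42a6-b9a0-05635efb3e59.parquet")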

In practice, the path is correct; I have also tried other paths, so the path is not the problem.
In the HDFS filesystem, there is a schema file in the subfolder /user/hive/warehouse/products/.metadata/schemas/1.avsc, so the metadata is there!
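
To double-check what Spark actually sees under that path, here is a quick listing sketch with the plain Hadoop FileSystem API (the .metadata folder suggests a Kite dataset, so the .parquet data files may sit next to .metadata rather than inside it; port 8020 is again an assumption on my side):

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}

// List what actually sits under the warehouse directory, to check whether
// any .parquet data files exist where Spark is told to look.
val hadoopConf = new Configuration()
hadoopConf.set("fs.defaultFS", "hdfs://quickstart.cloudera:8020") // assumed NameNode port
val fs = FileSystem.get(hadoopConf)
fs.listStatus(new Path("/user/hive/warehouse/products"))
  .foreach(status => println(s"${status.getPath} (${status.getLen} bytes)"))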

Maybe I have some trouble with the POM file, but I really don't know where...

Can anyone help me?
Thanks to all!
