Member since: 10-03-2016
Posts: 11
Kudos Received: 1
Solutions: 0
09-20-2017
02:01 AM
Oh, it worked!

>>> new_df.collect()
[Row(_c0='None', _c1='', _c2='1234', _c3='12345')]

Thank you very much, and sorry for my poor English. I still need to learn both UDFs and English!
09-19-2017
11:37 AM
Thank you for your reply, but an error occurred. Please tell me how to resolve it.

>>> test = spark.read.csv("test/test.csv")
17/09/19 20:28:45 WARN DataSource: Error while looking for metadata directory.
>>> test.show()
+----+---+-----+------+
| _c0|_c1| _c2| _c3|
+----+---+-----+------+
|null| ,|12,34|1234,5|
+----+---+-----+------+

>>> import re
>>> from pyspark.sql.functions import UserDefinedFunction
>>> from pyspark.sql.types import *
>>> udf = UserDefinedFunction(lambda x: re.sub(',','',x), StringType())
>>> new_df = test.select(*[udf(column).alias(column) for column in test.columns])
>>> new_df.collect()
17/09/19 20:29:38 WARN TaskSetManager: Lost task 0.0 in stage 12.0 (TID 30, lxjtddsap048.lixil.lan, executor 2): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
File "/usr/hdp/current/spark2-client/python/pyspark/worker.py", line 174, in main
process()
File "/usr/hdp/current/spark2-client/python/pyspark/worker.py", line 169, in process
serializer.dump_stream(func(split_index, iterator), outfile)
File "/usr/hdp/current/spark2-client/python/pyspark/serializers.py", line 220, in dump_stream
self.serializer.dump_stream(self._batched(iterator), stream)
File "/usr/hdp/current/spark2-client/python/pyspark/serializers.py", line 138, in dump_stream
for obj in iterator:
File "/usr/hdp/current/spark2-client/python/pyspark/serializers.py", line 209, in _batched
for item in iterator:
File "<string>", line 1, in <lambda>
File "/usr/hdp/current/spark2-client/python/pyspark/worker.py", line 70, in <lambda>
return lambda *a: f(*a)
File "<stdin>", line 1, in <lambda>
File "/usr/lib64/python3.4/re.py", line 179, in sub
return _compile(pattern, flags).sub(repl, string, count)
TypeError: expected string or buffer
at org.apache.spark.api.python.PythonRunner$anon$1.read(PythonRDD.scala:193)
at org.apache.spark.api.python.PythonRunner$anon$1.<init>(PythonRDD.scala:234)
at org.apache.spark.api.python.PythonRunner.compute(PythonRDD.scala:152)
at org.apache.spark.sql.execution.python.BatchEvalPythonExec$anonfun$doExecute$1.apply(BatchEvalPythonExec.scala:144)
at org.apache.spark.sql.execution.python.BatchEvalPythonExec$anonfun$doExecute$1.apply(BatchEvalPythonExec.scala:87)
at org.apache.spark.rdd.RDD$anonfun$mapPartitions$1$anonfun$apply$23.apply(RDD.scala:797)
at org.apache.spark.rdd.RDD$anonfun$mapPartitions$1$anonfun$apply$23.apply(RDD.scala:797)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
at org.apache.spark.scheduler.Task.run(Task.scala:99)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:322)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:745)
17/09/19 20:29:38 ERROR TaskSetManager: Task 0 in stage 12.0 failed 4 times; aborting job
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "/usr/hdp/current/spark2-client/python/pyspark/sql/dataframe.py", line 391, in collect
port = self._jdf.collectToPython()
File "/usr/hdp/current/spark2-client/python/lib/py4j-0.10.4-src.zip/py4j/java_gateway.py", line 1133, in __call__
File "/usr/hdp/current/spark2-client/python/pyspark/sql/utils.py", line 63, in deco
return f(*a, **kw)
File "/usr/hdp/current/spark2-client/python/lib/py4j-0.10.4-src.zip/py4j/protocol.py", line 319, in get_return_value
py4j.protocol.Py4JJavaError: An error occurred while calling o529.collectToPython.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 12.0 failed 4 times, most recent failure: Lost task 0.3 in stage 12.0 (TID 33, lxjtddsap048.lixil.lan, executor 2): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
File "/usr/hdp/current/spark2-client/python/pyspark/worker.py", line 174, in main
process()
File "/usr/hdp/current/spark2-client/python/pyspark/worker.py", line 169, in process
serializer.dump_stream(func(split_index, iterator), outfile)
File "/usr/hdp/current/spark2-client/python/pyspark/serializers.py", line 220, in dump_stream
self.serializer.dump_stream(self._batched(iterator), stream)
File "/usr/hdp/current/spark2-client/python/pyspark/serializers.py", line 138, in dump_stream
for obj in iterator:
File "/usr/hdp/current/spark2-client/python/pyspark/serializers.py", line 209, in _batched
for item in iterator:
File "<string>", line 1, in <lambda>
File "/usr/hdp/current/spark2-client/python/pyspark/worker.py", line 70, in <lambda>
return lambda *a: f(*a)
File "<stdin>", line 1, in <lambda>
File "/usr/lib64/python3.4/re.py", line 179, in sub
return _compile(pattern, flags).sub(repl, string, count)
TypeError: expected string or buffer
at org.apache.spark.api.python.PythonRunner$anon$1.read(PythonRDD.scala:193)
at org.apache.spark.api.python.PythonRunner$anon$1.<init>(PythonRDD.scala:234)
at org.apache.spark.api.python.PythonRunner.compute(PythonRDD.scala:152)
at org.apache.spark.sql.execution.python.BatchEvalPythonExec$anonfun$doExecute$1.apply(BatchEvalPythonExec.scala:144)
at org.apache.spark.sql.execution.python.BatchEvalPythonExec$anonfun$doExecute$1.apply(BatchEvalPythonExec.scala:87)
at org.apache.spark.rdd.RDD$anonfun$mapPartitions$1$anonfun$apply$23.apply(RDD.scala:797)
at org.apache.spark.rdd.RDD$anonfun$mapPartitions$1$anonfun$apply$23.apply(RDD.scala:797)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
at org.apache.spark.scheduler.Task.run(Task.scala:99)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:322)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:745)
Driver stacktrace:
at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$failJobAndIndependentStages(DAGScheduler.scala:1435)
at org.apache.spark.scheduler.DAGScheduler$anonfun$abortStage$1.apply(DAGScheduler.scala:1423)
at org.apache.spark.scheduler.DAGScheduler$anonfun$abortStage$1.apply(DAGScheduler.scala:1422)
at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1422)
at org.apache.spark.scheduler.DAGScheduler$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:802)
at org.apache.spark.scheduler.DAGScheduler$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:802)
at scala.Option.foreach(Option.scala:257)
at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:802)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1650)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1605)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1594)
at org.apache.spark.util.EventLoop$anon$1.run(EventLoop.scala:48)
at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:628)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1925)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1938)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1951)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1965)
at org.apache.spark.rdd.RDD$anonfun$collect$1.apply(RDD.scala:936)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
at org.apache.spark.rdd.RDD.withScope(RDD.scala:362)
at org.apache.spark.rdd.RDD.collect(RDD.scala:935)
at org.apache.spark.sql.execution.SparkPlan.executeCollect(SparkPlan.scala:275)
at org.apache.spark.sql.Dataset$anonfun$collectToPython$1.apply$mcI$sp(Dataset.scala:2760)
at org.apache.spark.sql.Dataset$anonfun$collectToPython$1.apply(Dataset.scala:2757)
at org.apache.spark.sql.Dataset$anonfun$collectToPython$1.apply(Dataset.scala:2757)
at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:57)
at org.apache.spark.sql.Dataset.withNewExecutionId(Dataset.scala:2780)
at org.apache.spark.sql.Dataset.collectToPython(Dataset.scala:2757)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:497)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:280)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:214)
at java.lang.Thread.run(Thread.java:745)
Caused by: org.apache.spark.api.python.PythonException: Traceback (most recent call last):
File "/usr/hdp/current/spark2-client/python/pyspark/worker.py", line 174, in main
process()
File "/usr/hdp/current/spark2-client/python/pyspark/worker.py", line 169, in process
serializer.dump_stream(func(split_index, iterator), outfile)
File "/usr/hdp/current/spark2-client/python/pyspark/serializers.py", line 220, in dump_stream
self.serializer.dump_stream(self._batched(iterator), stream)
File "/usr/hdp/current/spark2-client/python/pyspark/serializers.py", line 138, in dump_stream
for obj in iterator:
File "/usr/hdp/current/spark2-client/python/pyspark/serializers.py", line 209, in _batched
for item in iterator:
File "<string>", line 1, in <lambda>
File "/usr/hdp/current/spark2-client/python/pyspark/worker.py", line 70, in <lambda>
return lambda *a: f(*a)
File "<stdin>", line 1, in <lambda>
File "/usr/lib64/python3.4/re.py", line 179, in sub
return _compile(pattern, flags).sub(repl, string, count)
TypeError: expected string or buffer
at org.apache.spark.api.python.PythonRunner$anon$1.read(PythonRDD.scala:193)
at org.apache.spark.api.python.PythonRunner$anon$1.<init>(PythonRDD.scala:234)
at org.apache.spark.api.python.PythonRunner.compute(PythonRDD.scala:152)
at org.apache.spark.sql.execution.python.BatchEvalPythonExec$anonfun$doExecute$1.apply(BatchEvalPythonExec.scala:144)
at org.apache.spark.sql.execution.python.BatchEvalPythonExec$anonfun$doExecute$1.apply(BatchEvalPythonExec.scala:87)
at org.apache.spark.rdd.RDD$anonfun$mapPartitions$1$anonfun$apply$23.apply(RDD.scala:797)
at org.apache.spark.rdd.RDD$anonfun$mapPartitions$1$anonfun$apply$23.apply(RDD.scala:797)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
at org.apache.spark.scheduler.Task.run(Task.scala:99)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:322)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
... 1 more

(Attached screenshot: スクリーンショット-2017-09-19-午後202942-午後.png)
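For reference, the root cause at the bottom of this traceback is the TypeError from re.sub: the null _c0 cell reaches the lambda as None, and Python's re.sub requires a string. Below is a null-safe sketch of the same UDF; it is an illustration reusing the names from the session above, not necessarily the exact fix the author applied.

# Null-safe variant of the UDF above: skip re.sub when the cell is None.
import re
from pyspark.sql.functions import UserDefinedFunction
from pyspark.sql.types import StringType

strip_commas = UserDefinedFunction(
    lambda x: re.sub(',', '', x) if x is not None else x,
    StringType())

new_df = test.select(*[strip_commas(c).alias(c) for c in test.columns])
new_df.collect()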
09-19-2017
07:52 AM
I want to replace "," with "" in all columns. For example, how should I turn the first table below into the second?

+----+---+-----+------+
| _c0|_c1|  _c2|   _c3|
+----+---+-----+------+
|null|  ,|12,34|1234,5|
+----+---+-----+------+

↓↓↓

+----+---+----+-----+
| _c0|_c1| _c2|  _c3|
+----+---+----+-----+
|null|   |1234|12345|
+----+---+----+-----+
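For a transformation like this, Spark's built-in regexp_replace also works without a Python UDF, and it passes nulls through unchanged. A minimal sketch, reusing the file path from the follow-up posts above:

# Apply the same literal-comma replacement to every column at once.
from pyspark.sql.functions import regexp_replace

test = spark.read.csv("test/test.csv")
cleaned = test.select(*[regexp_replace(c, ',', '').alias(c) for c in test.columns])
cleaned.show()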
Labels:
- Apache Spark
04-27-2017
06:43 AM
I tried to truncate a table in Hive, but an error occurred. What should I do?

hive> truncate table ds_0co_om_cca_1_d_enr_temp;
FAILED: Execution Error, return code 1 from org.apache.hadoop.hive.ql.exec.DDLTask. Exception while processing
hive>
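Two common causes of this generic DDLTask failure are worth checking first (both are assumptions here, since the message carries no detail): TRUNCATE TABLE only works on managed tables and fails on external ones, and the user running Hive needs write access to the table's HDFS location. DESCRIBE FORMATTED shows both:

hive> DESCRIBE FORMATTED ds_0co_om_cca_1_d_enr_temp;
-- Check "Table Type": TRUNCATE requires MANAGED_TABLE, not EXTERNAL_TABLE.
-- Check "Location": the Hive user needs write permission on that HDFS path.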
Labels:
- Apache Hadoop
- Apache Hive
12-19-2016
08:24 AM
I did. Thank you!
12-09-2016
07:00 AM
Does Drill support High Availability (HA) Hadoop? I am trying to use Hive with Drill, but the following error occurs:

0: jdbc:drill:zk=XXXX:2181> select * from hive.test_db.bkpf limit 100;
Error: SYSTEM ERROR: UnknownHostException: XXXX [Error Id: da1f6baa-6128-4449-b82a-f89be89f9bb3 on XXXX:31010] (state=,code=0)
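Drill can work with HA HDFS, but an UnknownHostException on what looks like the HA nameservice name usually means the drillbits cannot resolve the logical name and are falling back to DNS. A sketch of the usual remedy, with paths assumed for a typical HDP install: put the cluster's client configuration on Drill's classpath and restart.

# Make dfs.nameservices and the failover proxy settings visible to Drill
# (paths are assumptions for a typical HDP install):
cp /etc/hadoop/conf/core-site.xml /etc/hadoop/conf/hdfs-site.xml $DRILL_HOME/conf/
# Then restart the drillbits so the new configuration is picked up.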
Labels:
- Apache Hadoop
- Apache Hive
11-21-2016
05:55 AM
Yes, I did. I had to change every eth0 entry in the Vora Manager UI as well. Now Vora is running fine.
11-16-2016
11:48 PM
Yes! At first, Vora did not run because of the eth0 setting. After checking the IP configuration, I changed eth0, and then Vora started fine.
11-16-2016
04:43 AM
Thank you! I had forgotten the following steps. After performing them, I was able to install Vora (the resulting sudoers lines are sketched after this list):

1. Open the /etc/sudoers file using the command sudo visudo.
2. Disable requiretty by setting the parameter: Defaults !requiretty.
3. Save the file.
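For context, the relevant part of /etc/sudoers after the edit looks like the lines below. This is a sketch: RHEL-family systems typically ship with requiretty enabled, and the file should only be edited via sudo visudo.

# If a "Defaults requiretty" line exists, comment it out or override it:
#Defaults    requiretty
Defaults    !requiretty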
11-14-2016
07:25 AM
1 Kudo
An error occurred...

Traceback (most recent call last)
File "/var/lib/ambari-agent/cache/stacks/HDP/2.4/services/vora-manager/package/scripts/vora_client.py", line 38, in <module>
VoraClient().execute()
File "/usr/lib/python2.6/site-packages/resource_management/libraries/script/script.py", line 219, in execute
method(env)
File "/var/lib/ambari-agent/cache/stacks/HDP/2.4/services/vora-manager/package/scripts/vora_client.py", line 24, in install
'PLATFORM': params.platform
File "/usr/lib/python2.6/site-packages/resource_management/core/base.py", line 154, in __init__
self.env.run()
File "/usr/lib/python2.6/site-packages/resource_management/core/environment.py", line 160, in run
self.run_action(resource, action)
File "/usr/lib/python2.6/site-packages/resource_management/core/environment.py", line 124, in run_action
provider_action()
File "/usr/lib/python2.6/site-packages/resource_management/core/providers/system.py", line 238, in action_run
tries=self.resource.tries, try_sleep=self.resource.try_sleep)
File "/usr/lib/python2.6/site-packages/resource_management/core/shell.py", line 70, in inner
result = function(command, **kwargs)
File "/usr/lib/python2.6/site-packages/resource_management/core/shell.py", line 92, in checked_call
tries=tries, try_sleep=try_sleep)
File "/usr/lib/python2.6/site-packages/resource_management/core/shell.py", line 140, in _call_wrapper
result = _call(command, **kwargs_copy)
File "/usr/lib/python2.6/site-packages/resource_management/core/shell.py", line 291, in _call
raise Fail(err_msg)
resource_management.core.exceptions.Fail: Execution of '/var/lib/ambari-agent/cache/stacks/HDP/2.4/services/vora-manager/package/scripts/control.py install_vora_client' returned 1.