Created on 04-09-2023 08:56 PM - edited on 04-11-2023 12:17 AM by VidyaSargur
To display the query metrics of effective runs of Analyzer/Optimizer Rules, we need to use the RuleExecutor object.
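The RuleExecutor metrics help us identify which rules take the most time, and how many of their runs actually changed the query plan (effective runs) versus the total number of runs.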
/**
 * Facade over a single shared `QueryExecutionMetering` instance.
 * Every member simply forwards to the corresponding method of that meter.
 */
object RuleExecutor {
  protected val queryExecutionMeter = QueryExecutionMetering()

  /** Returns the meter's report of time spent running specific rules, as a string. */
  def dumpTimeSpent(): String = queryExecutionMeter.dumpTimeSpent()

  /** Clears the meter's statistics about time spent running specific rules. */
  def resetMetrics(): Unit = queryExecutionMeter.resetMetrics()

  /** Returns whatever the meter's `getMetrics()` currently reports. */
  def getCurrentMetrics(): QueryExecutionMetrics = queryExecutionMeter.getMetrics()
}
Display the query metrics using Scala code:
import org.apache.spark.sql.catalyst.rules.RuleExecutor
// `col` is imported explicitly so the snippet also compiles where
// org.apache.spark.sql.functions._ is not already in scope.
import org.apache.spark.sql.functions.col

// Build the starting DataFrame before resetting, so that only the
// withColumn calls below are measured.
val base = spark.range(100).toDF()

// Do this first or your values will be cumulative
RuleExecutor.resetMetrics()

// Add 500 derived columns one at a time, threading the DataFrame through
// a fold instead of reassigning a var.
val df = (1 to 500).foldLeft(base) { (acc, i) =>
  acc.withColumn("id_" + i, col("id") + i)
}

println(RuleExecutor.dumpTimeSpent())
Output:
=== Metrics of Analyzer/Optimizer Rules ===
Total number of runs: 84000
Total time: 3.88783617 seconds
Rule Effective Time / Total Time Effective Runs / Total Runs
org.apache.spark.sql.catalyst.analysis.TypeCoercion$ImplicitTypeCasts 440769504 / 514930128 500 / 1500
org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveReferences 316457344 / 452531027 500 / 1500
org.apache.spark.sql.catalyst.analysis.Analyzer$ExtractGenerator 0 / 225949844 0 / 1500
org.apache.spark.sql.catalyst.analysis.Analyzer$FixNullability 0 / 218911510 0 / 500
org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveDeserializer 0 / 164356553 0 / 1500
org.apache.spark.sql.catalyst.analysis.TimeWindowing 0 / 141921674 0 / 1500
org.apache.spark.sql.catalyst.analysis.ResolveTimeZone 61465421 / 109227965 500 / 1500
org.apache.spark.sql.catalyst.analysis.Analyzer$GlobalAggregates 0 / 98820566 0 / 1500
org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveRandomSeed 0 / 95189919 0 / 1500
org.apache.spark.sql.catalyst.analysis.DecimalPrecision 0 / 87679682 0 / 1500
org.apache.spark.sql.catalyst.analysis.TypeCoercion$ConcatCoercion 0 / 84910897 0 / 1500
org.apache.spark.sql.catalyst.analysis.ResolveCreateNamedStruct 0 / 83477029 0 / 1500
org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveWindowFrame 0 / 80862398 0 / 1500
org.apache.spark.sql.catalyst.analysis.TypeCoercion$PromoteStrings 0 / 79603933 0 / 1500
org.apache.spark.sql.catalyst.analysis.TypeCoercion$DateTimeOperations 0 / 77833267 0 / 1500
org.apache.spark.sql.catalyst.analysis.TypeCoercion$InConversion 0 / 75004425 0 / 1500
org.apache.spark.sql.catalyst.analysis.TypeCoercion$IfCoercion 0 / 70605659 0 / 1500
org.apache.spark.sql.catalyst.analysis.TypeCoercion$FunctionArgumentConversion 0 / 68207411 0 / 1500
org.apache.spark.sql.catalyst.analysis.TypeCoercion$BooleanEquality 0 / 63671046 0 / 1500
org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveFunctions 0 / 62710050 0 / 1500
org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveSubquery 0 / 60797560 0 / 1500
org.apache.spark.sql.catalyst.analysis.TypeCoercion$EltCoercion 0 / 59701827 0 / 1500
org.apache.spark.sql.catalyst.analysis.TypeCoercion$Division 0 / 58397231 0 / 1500
org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveWindowOrder 0 / 57996555 0 / 1500
org.apache.spark.sql.catalyst.analysis.ResolveHigherOrderFunctions 0 / 55920753 0 / 1500
org.apache.spark.sql.catalyst.analysis.TypeCoercion$CaseWhenCoercion 0 / 55915403 0 / 1500
org.apache.spark.sql.catalyst.analysis.TypeCoercion$WindowFrameCoercion 0 / 55282298 0 / 1500
org.apache.spark.sql.catalyst.analysis.TypeCoercion$StackCoercion 0 / 55127879 0 / 1500
org.apache.spark.sql.catalyst.analysis.CleanupAliases 25547573 / 48980732 500 / 1000
org.apache.spark.sql.catalyst.analysis.ResolveLambdaVariables 0 / 40694858 0 / 1500
org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveUpCast 0 / 29383949 0 / 1500
org.apache.spark.sql.catalyst.analysis.Analyzer$HandleNullInputsForUDF 0 / 29069226 0 / 500
org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveNewInstance 0 / 28936950 0 / 1500
org.apache.spark.sql.catalyst.analysis.Analyzer$ExtractWindowExpressions 0 / 27205184 0 / 1500
org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveAliases 0 / 26819109 0 / 1500
org.apache.spark.sql.catalyst.analysis.Analyzer$LookupFunctions 0 / 25447455 0 / 500
org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveGenerate 0 / 19031939 0 / 1500
org.apache.spark.sql.execution.datasources.ResolveSQLOnFile 0 / 18588169 0 / 1500
org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveGroupingAnalytics 0 / 14494796 0 / 1500
org.apache.spark.sql.catalyst.analysis.ResolveInlineTables 0 / 13098274 0 / 1500
org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveRelations 0 / 13007322 0 / 1500
org.apache.spark.sql.execution.datasources.FindDataSourceTable 0 / 12953222 0 / 1500
org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveNaturalAndUsingJoin 0 / 12934635 0 / 1500
org.apache.spark.sql.catalyst.analysis.TypeCoercion$WidenSetOperationTypes 0 / 12923604 0 / 1500
org.apache.spark.sql.catalyst.analysis.Analyzer$ResolvePivot 0 / 12910359 0 / 1500
org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveMissingReferences 0 / 12652452 0 / 1500
org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveSubqueryColumnAliases 0 / 12609578 0 / 1500
org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveOrdinalInOrderByAndGroupBy 0 / 12577636 0 / 1500
org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveAggregateFunctions 0 / 12549282 0 / 1500
org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveAggAliasInGroupBy 0 / 12509176 0 / 1500
org.apache.spark.sql.catalyst.analysis.ResolveTableValuedFunctions 0 / 12436875 0 / 1500
org.apache.spark.sql.hive.ResolveHiveSerdeTable 0 / 12354180 0 / 1500
org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveOutputRelation 0 / 12323360 0 / 1500
org.apache.spark.sql.catalyst.analysis.ResolveHints$ResolveBroadcastHints 0 / 11305271 0 / 500
org.apache.spark.sql.catalyst.analysis.ResolveHints$ResolveCoalesceHints 0 / 5869056 0 / 500
org.apache.spark.sql.catalyst.analysis.UpdateOuterReferences 0 / 5663024 0 / 500
org.apache.spark.sql.catalyst.analysis.AliasViewChild 0 / 5435219 0 / 500
org.apache.spark.sql.catalyst.analysis.Analyzer$PullOutNondeterministic 0 / 5325264 0 / 500
org.apache.spark.sql.execution.datasources.DataSourceAnalysis 0 / 5177154 0 / 500
org.apache.spark.sql.execution.datasources.PreprocessTableCreation 0 / 5001442 0 / 500
org.apache.spark.sql.catalyst.analysis.ResolveHints$RemoveAllHints 0 / 4902903 0 / 500
org.apache.spark.sql.hive.DetermineTableStats 0 / 4890450 0 / 500
org.apache.spark.sql.catalyst.analysis.Analyzer$CTESubstitution 0 / 4862617 0 / 500
org.apache.spark.sql.hive.RelationConversions 0 / 4755488 0 / 500
org.apache.spark.sql.hive.HiveAnalysis 0 / 4703621 0 / 500
org.apache.spark.sql.catalyst.analysis.EliminateUnions 0 / 4532930 0 / 500
org.apache.spark.sql.execution.datasources.PreprocessTableInsertion 0 / 4528303 0 / 500
org.apache.spark.sql.catalyst.analysis.Analyzer$WindowsSubstitution 0 / 4486339 0 / 500
org.apache.spark.sql.catalyst.analysis.SubstituteUnresolvedOrdinals 0 / 4360278 0 / 500
Display the query metrics using Python code:
df = spark.range(100)

# Reach the JVM-side RuleExecutor object through the session's JVM gateway.
RuleExecutor = spark._jvm.org.apache.spark.sql.catalyst.rules.RuleExecutor

# Reset first, otherwise the reported values are cumulative.
RuleExecutor.resetMetrics()

for i in range(500):
    # NOTE(review): the addend is a *string* (str(i)), unlike the Scala example,
    # which adds an integer. It is kept as-is because the sample output that
    # follows was produced with it; presumably this is why
    # TypeCoercion$PromoteStrings shows effective runs there. Confirm before
    # changing it to a numeric addend.
    df = df.withColumn("id_" + str(i), df["id"] + str(i))

print(RuleExecutor.dumpTimeSpent())
Output:
=== Metrics of Analyzer/Optimizer Rules ===
Total number of runs: 84000
Total time: 4.083792217 seconds
Rule Effective Time / Total Time Effective Runs / Total Runs
org.apache.spark.sql.catalyst.analysis.TypeCoercion$PromoteStrings 428130064 / 486834072 500 / 1500
org.apache.spark.sql.catalyst.analysis.TypeCoercion$ImplicitTypeCasts 386199112 / 471189375 500 / 1500
org.apache.spark.sql.catalyst.analysis.Analyzer$FixNullability 0 / 267973019 0 / 500
org.apache.spark.sql.catalyst.analysis.Analyzer$ExtractGenerator 0 / 234887998 0 / 1500
org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveReferences 0 / 221097152 0 / 1500
org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveDeserializer 0 / 187387962 0 / 1500
org.apache.spark.sql.catalyst.analysis.TimeWindowing 0 / 169662648 0 / 1500
org.apache.spark.sql.catalyst.analysis.Analyzer$GlobalAggregates 0 / 118567324 0 / 1500
org.apache.spark.sql.catalyst.analysis.DecimalPrecision 0 / 114827165 0 / 1500
org.apache.spark.sql.catalyst.analysis.ResolveTimeZone 66654151 / 114117034 500 / 1500
org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveRandomSeed 0 / 94635230 0 / 1500
org.apache.spark.sql.catalyst.analysis.TypeCoercion$FunctionArgumentConversion 0 / 75385981 0 / 1500
org.apache.spark.sql.catalyst.analysis.TypeCoercion$InConversion 0 / 74819084 0 / 1500
org.apache.spark.sql.catalyst.analysis.TypeCoercion$DateTimeOperations 0 / 71308118 0 / 1500
org.apache.spark.sql.catalyst.analysis.TypeCoercion$WindowFrameCoercion 0 / 66340413 0 / 1500
org.apache.spark.sql.catalyst.analysis.TypeCoercion$BooleanEquality 0 / 60474458 0 / 1500
org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveFunctions 0 / 60323978 0 / 1500
org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveSubquery 0 / 60093349 0 / 1500
org.apache.spark.sql.catalyst.analysis.TypeCoercion$IfCoercion 0 / 59012355 0 / 1500
org.apache.spark.sql.catalyst.analysis.TypeCoercion$Division 0 / 57692382 0 / 1500
org.apache.spark.sql.catalyst.analysis.TypeCoercion$ConcatCoercion 0 / 57007739 0 / 1500
org.apache.spark.sql.catalyst.analysis.ResolveCreateNamedStruct 0 / 55960927 0 / 1500
org.apache.spark.sql.catalyst.analysis.TypeCoercion$EltCoercion 0 / 55746694 0 / 1500
org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveWindowOrder 0 / 54499655 0 / 1500
org.apache.spark.sql.catalyst.analysis.ResolveHigherOrderFunctions 0 / 54407217 0 / 1500
org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveWindowFrame 0 / 53640536 0 / 1500
org.apache.spark.sql.catalyst.analysis.TypeCoercion$CaseWhenCoercion 0 / 53254689 0 / 1500
org.apache.spark.sql.catalyst.analysis.TypeCoercion$StackCoercion 0 / 52972883 0 / 1500
org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveNewInstance 0 / 40538255 0 / 1500
org.apache.spark.sql.catalyst.analysis.ResolveLambdaVariables 0 / 39820352 0 / 1500
org.apache.spark.sql.catalyst.analysis.Analyzer$ExtractWindowExpressions 0 / 39517598 0 / 1500
org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveAliases 0 / 37882907 0 / 1500
org.apache.spark.sql.catalyst.analysis.CleanupAliases 17699843 / 33339362 500 / 1000
org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveUpCast 0 / 28576611 0 / 1500
org.apache.spark.sql.catalyst.analysis.Analyzer$LookupFunctions 0 / 26537381 0 / 500
org.apache.spark.sql.catalyst.analysis.Analyzer$HandleNullInputsForUDF 0 / 20261221 0 / 500
org.apache.spark.sql.catalyst.analysis.ResolveInlineTables 0 / 14907118 0 / 1500
org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveGroupingAnalytics 0 / 14088163 0 / 1500
org.apache.spark.sql.hive.ResolveHiveSerdeTable 0 / 13454797 0 / 1500
org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveRelations 0 / 13337544 0 / 1500
org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveGenerate 0 / 13324686 0 / 1500
org.apache.spark.sql.catalyst.analysis.ResolveTableValuedFunctions 0 / 13170550 0 / 1500
org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveOutputRelation 0 / 13104878 0 / 1500
org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveAggregateFunctions 0 / 13068603 0 / 1500
org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveMissingReferences 0 / 13012218 0 / 1500
org.apache.spark.sql.catalyst.analysis.Analyzer$ResolvePivot 0 / 12952997 0 / 1500
org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveOrdinalInOrderByAndGroupBy 0 / 12928211 0 / 1500
org.apache.spark.sql.catalyst.analysis.TypeCoercion$WidenSetOperationTypes 0 / 12859184 0 / 1500
org.apache.spark.sql.execution.datasources.FindDataSourceTable 0 / 12798249 0 / 1500
org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveSubqueryColumnAliases 0 / 12657792 0 / 1500
org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveNaturalAndUsingJoin 0 / 12643004 0 / 1500
org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveAggAliasInGroupBy 0 / 12455529 0 / 1500
org.apache.spark.sql.execution.datasources.ResolveSQLOnFile 0 / 12143355 0 / 1500
org.apache.spark.sql.catalyst.analysis.ResolveHints$ResolveBroadcastHints 0 / 12042103 0 / 500
org.apache.spark.sql.catalyst.analysis.ResolveHints$ResolveCoalesceHints 0 / 6474487 0 / 500
org.apache.spark.sql.catalyst.analysis.UpdateOuterReferences 0 / 5663492 0 / 500
org.apache.spark.sql.catalyst.analysis.AliasViewChild 0 / 5625356 0 / 500
org.apache.spark.sql.hive.RelationConversions 0 / 5533718 0 / 500
org.apache.spark.sql.catalyst.analysis.Analyzer$PullOutNondeterministic 0 / 5348453 0 / 500
org.apache.spark.sql.catalyst.analysis.Analyzer$WindowsSubstitution 0 / 5247319 0 / 500
org.apache.spark.sql.catalyst.analysis.Analyzer$CTESubstitution 0 / 5239071 0 / 500
org.apache.spark.sql.execution.datasources.PreprocessTableCreation 0 / 5216258 0 / 500
org.apache.spark.sql.catalyst.analysis.ResolveHints$RemoveAllHints 0 / 5027277 0 / 500
org.apache.spark.sql.catalyst.analysis.EliminateUnions 0 / 5003055 0 / 500
org.apache.spark.sql.catalyst.analysis.SubstituteUnresolvedOrdinals 0 / 4939426 0 / 500
org.apache.spark.sql.execution.datasources.DataSourceAnalysis 0 / 4930642 0 / 500
org.apache.spark.sql.hive.HiveAnalysis 0 / 4709238 0 / 500
org.apache.spark.sql.hive.DetermineTableStats 0 / 4658002 0 / 500
org.apache.spark.sql.execution.datasources.PreprocessTableInsertion 0 / 4634318 0 / 500