
Issue generating an auto-increment ID for a unique ID

package com.mch.hive.hiveudf;

import org.apache.hadoop.hive.ql.exec.UDF;
import org.apache.hadoop.hive.ql.udf.UDFType;

@UDFType(deterministic = false) //, stateful = true)
public class MchAutoIncrementIdUDF extends UDF {

    // Counter holding the current sequence value; starts at the default of 0.
    long Incremental_Start_Value = 0;

    // Static guard so the starting value is seeded only once.
    static long flag = 0;

    // Takes an employee ID and the incremental starting value as parameters.
    public String evaluate(Long input_Emp_Id, Long incremental_Value) {

        String result;

        // On the first call, seed the counter with the supplied starting value.
        if (flag == 0) {
            Incremental_Start_Value = incremental_Value;
            flag++;
        }

        // Concatenate the current counter value and the employee ID.
        result = Incremental_Start_Value + "_" + input_Emp_Id;
        Incremental_Start_Value++;

        return result; // Final value looks like, e.g., 101_123456789
    }
}
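
For context, here is a minimal standalone sketch of the instance/static split in this class (my own illustration, not part of the jar; the class name JvmInstanceDemo is made up). Hive may construct more than one UDF instance per JVM, and one per task across the cluster. The static flag is shared inside a JVM, but Incremental_Start_Value is per-instance, so a later instance skips the seeding branch and counts up from the default 0:

import com.mch.hive.hiveudf.MchAutoIncrementIdUDF;

// Hypothetical demo: two instances of the UDF created in the same JVM.
// The static flag is shared, so only the first instance runs the seeding
// branch; the second instance's per-instance counter restarts at 0.
public class JvmInstanceDemo {
    public static void main(String[] args) {
        MchAutoIncrementIdUDF first = new MchAutoIncrementIdUDF();
        System.out.println(first.evaluate(100L, 1L)); // 1_100 (seeded to 1)
        System.out.println(first.evaluate(101L, 1L)); // 2_101
        System.out.println(first.evaluate(102L, 1L)); // 3_102

        MchAutoIncrementIdUDF second = new MchAutoIncrementIdUDF();
        System.out.println(second.evaluate(200L, 1L)); // 0_200 (flag != 0, never seeded)
        System.out.println(second.evaluate(201L, 1L)); // 1_201
    }
}

That interaction would explain a single-digit prefix like 9_9809614 at the end of a 7,954,740-row load: the prefix is a counter local to one UDF instance, not a global row count.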

Here is the table DDL, followed by the UDF registration:

 

CREATE TABLE IF NOT EXISTS w_encounter_inc(
encounter_id int,
udf_updated_encounter_id string,
medical_record_number string,
financial_number string,
patient_account_number double,
ms_drg_id string,
ms_drg_description string,
apr_drg_id string,
apr_drg_description string,
rom_code_value double,
apr_risk_of_mortality string,
soi_code_value double,
apr_severity_of_illness string,
ms_mdc_code_id double,
ms_mdc_code_description string,
apr_mdc_code_id double,
apr_mdc_code_description string,
ms_drg_weight double,
ms_gmlos double,
ms_amlos double,
primary_health_plan_id double,
secondary_health_plan_id double,
primary_plan_type_code double,
default_guarantor_id double,
family_member_id double,
attending_physician_id double,
admitting_physician_id double,
referring_physician_id double,
primary_care_physician_id double,
consult_physician_1_id double,
consult_physician_2_id double,
consult_physician_3_id double,
consult_physician_4_id double,
admit_location_code double,
admit_admit_type_code double,
admit_location_facility_code double,
admit_location_building_code double,
admit_location_room_code double,
admit_location_bed_code double,
admit_arrive_date string,
admit_depart_date string,
disch_location_code double,
disch_admit_type_code double,
disch_location_facility_code double,
disch_location_building_code double,
disch_location_room_code double,
disch_location_bed_code double,
disch_arrive_date string,
disch_depart_date string,
race_code double,
race_display string,
ethnic_group_code double,
ethnic_group string,
language_code double,
language_display string,
sex_code double,
sex string,
religion_code double,
religion_display string,
person_id double,
encounter_class_code double,
encounter_type_code double,
encounter_type_class_code double,
encounter_status_code double,
pre_reg_date string,
pre_reg_personnel_id double,
reg_date string,
reg_personnel_id double,
est_arrive_date string,
est_depart_date string,
arrive_date string,
depart_date string,
legacy_discharge_day string,
admission_day string,
admit_type_code double,
admit_src_code double,
admit_mode_code double,
referring_comment string,
disch_disposition_code double,
disch_to_location_code double,
accommodation_code double,
accommodation_request_code double,
ambulatory_cond_code double,
courtesy_code double,
isolation_code double,
med_service_code double,
confid_level_code double,
location_code double,
location_facility_code double,
location_building_code double,
location_nurse_unit_code double,
location_room_code double,
location_bed_code double,
disch_date string,
location_temp_code double,
organization_id double,
reason_for_visit string,
encounter_financial_id double,
financial_class_code double,
trauma_code double,
triage_date string,
visitor_status_code double,
refer_facility_code double,
trauma_date string,
accommodation_reason_code double,
encounter_complete_date string,
assign_to_location_date string,
inpatient_admit_date string,
legacy_admission_day string,
initial_bill_day string,
last_bill_cycle_day string,
current_balance double,
balance_of_unbilled_charges double,
total_charges_for_account double,
total_adj_for_this_account double,
total_payments_for_account double,
expected_payments double,
last_payment_day string,
last_payment_amount double,
balance_on_last_statement double,
total_costs double,
total_fixed_costs double,
total_variable_costs double,
total_direct_costs double,
total_indirect_costs double,
surgery_start_date string,
surgeon_physician_id double,
dw_creation_date string,
dw_update_date string,
tertiary_health_plan_id double,
quaternary_health_plan_id double,
legacy_payor_id1 string,
legacy_payor_id2 string,
legacy_payor_id3 string,
legacy_payor_id4 string,
admission_date string,
discharge_date string,
pb_ins_payments_applied double,
pb_ins_adjustments_applied double,
pb_patient_payments_applied double,
pb_patient_adjustments_applied double,
pb_withhold_applied double,
pb_approved_amount double,
pb_value double,
pb_value_balance double,
pb_amount double,
pb_balance double,
pb_rvu_work double,
primary_diagnosis_number string,
primary_diagnosis_vocabulary string,
primary_procedure_number string,
primary_procedure_vocabulary string,
travel_indicator string,
frequent_flier_flag string,
marketing_analytic_id string,
mrn_alias_pool_display string,
visit_id string)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '\t'
LINES TERMINATED BY '\n'
STORED AS TEXTFILE;

 

 

add jar /home/MCH/jthatipa/backup/AutoIncLong123.jar;


create function INCREMENT_SEQID as 'com.mch.hive.hiveudf.MchAutoIncrementIdUDF';

set hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat;

set mapred.map.tasks = 01;

 


INSERT INTO TABLE udfautoincrement.w_encounter_inc
SELECT encounter_id,
INCREMENT_SEQID(cast(encounter_id as int), 1) as udf_updated_encounter_id,
medical_record_number,
financial_number,
patient_account_number,
ms_drg_id,
ms_drg_description,
apr_drg_id,
apr_drg_description,
rom_code_value,
apr_risk_of_mortality,
soi_code_value,
apr_severity_of_illness,
ms_mdc_code_id,
ms_mdc_code_description,
apr_mdc_code_id,
apr_mdc_code_description,
ms_drg_weight,
ms_gmlos,
ms_amlos,
primary_health_plan_id,
secondary_health_plan_id,
primary_plan_type_code,
default_guarantor_id,
family_member_id,
attending_physician_id,
admitting_physician_id,
referring_physician_id,
primary_care_physician_id,
consult_physician_1_id,
consult_physician_2_id,
consult_physician_3_id,
consult_physician_4_id,
admit_location_code,
admit_admit_type_code,
admit_location_facility_code,
admit_location_building_code,
admit_location_room_code,
admit_location_bed_code,
admit_arrive_date,
admit_depart_date,
disch_location_code,
disch_admit_type_code,
disch_location_facility_code,
disch_location_building_code,
disch_location_room_code,
disch_location_bed_code,
disch_arrive_date,
disch_depart_date,
race_code,
race_display,
ethnic_group_code,
ethnic_group,
language_code,
language_display,
sex_code,
sex,
religion_code,
religion_display,
person_id,
encounter_class_code,
encounter_type_code,
encounter_type_class_code,
encounter_status_code,
pre_reg_date,
pre_reg_personnel_id,
reg_date,
reg_personnel_id,
est_arrive_date,
est_depart_date,
arrive_date,
depart_date,
legacy_discharge_day,
admission_day,
admit_type_code,
admit_src_code,
admit_mode_code,
referring_comment,
disch_disposition_code,
disch_to_location_code,
accommodation_code,
accommodation_request_code,
ambulatory_cond_code,
courtesy_code,
isolation_code,
med_service_code,
confid_level_code,
location_code,
location_facility_code,
location_building_code,
location_nurse_unit_code,
location_room_code,
location_bed_code,
disch_date,
location_temp_code,
organization_id,
reason_for_visit,
encounter_financial_id,
financial_class_code,
trauma_code,
triage_date,
visitor_status_code,
refer_facility_code,
trauma_date,
accommodation_reason_code,
encounter_complete_date,
assign_to_location_date,
inpatient_admit_date,
legacy_admission_day,
initial_bill_day,
last_bill_cycle_day,
current_balance,
balance_of_unbilled_charges,
total_charges_for_account,
total_adj_for_this_account,
total_payments_for_account,
expected_payments,
last_payment_day,
last_payment_amount,
balance_on_last_statement,
total_costs,
total_fixed_costs,
total_variable_costs,
total_direct_costs,
total_indirect_costs,
surgery_start_date,
surgeon_physician_id,
dw_creation_date,
dw_update_date,
tertiary_health_plan_id,
quaternary_health_plan_id,
legacy_payor_id1,
legacy_payor_id2,
legacy_payor_id3,
legacy_payor_id4,
admission_date,
discharge_date,
pb_ins_payments_applied,
pb_ins_adjustments_applied,
pb_patient_payments_applied,
pb_patient_adjustments_applied,
pb_withhold_applied,
pb_approved_amount,
pb_value,
pb_value_balance,
pb_amount,
pb_balance,
pb_rvu_work,
primary_diagnosis_number,
primary_diagnosis_vocabulary,
primary_procedure_number,
primary_procedure_vocabulary,
travel_indicator,
frequent_flier_flag,
marketing_analytic_id,
mrn_alias_pool_display,
visit_id from udfautoincrement.w_encounter_may23 ;

 

Here is the execution output:

 

45900409 1 7954727
45900412 1 7954728
45900416 1 7954729
45900419 1 7954730
45900425 1 7954731
45900429 1 7954732
45900432 1 7954733
45900436 1 7954734
45900439 1 7954735
45900443 1 7954736
45900446 1 7954737
45900449 1 7954738
45900452 1 7954739
45900456 1 7954740
Time taken: 183.656 seconds, Fetched: 7954740 row(s)
hive> create table encounter_id_count_window as select encounter_id, count(encounter_id) over (partition by encounter_id) from w_encounter_seqid group by encounter_id;
Query ID = jthatipa_20170613123131_3478898c-1a41-4611-8741-612745621319
Total jobs = 2
Launching Job 1 out of 2
Number of reduce tasks not specified. Estimated from input data size: 134
In order to change the average load for a reducer (in bytes):
set hive.exec.reducers.bytes.per.reducer=<number>
In order to limit the maximum number of reducers:
set hive.exec.reducers.max=<number>
In order to set a constant number of reducers:
set mapreduce.job.reduces=<number>
Starting Job = job_1481171450798_0411, Tracking URL = https://svrhdpm03.mch.corp.int:8090/proxy/application_1481171450798_0411/
Kill Command = /opt/cloudera/parcels/CDH-5.7.1-1.cdh5.7.1.p0.11/lib/hadoop/bin/hadoop job -kill job_1481171450798_0411
Hadoop job information for Stage-1: number of mappers: 32; number of reducers: 134
2017-06-13 12:31:20,482 Stage-1 map = 0%, reduce = 0%
2017-06-13 12:31:29,859 Stage-1 map = 3%, reduce = 0%, Cumulative CPU 3.82 sec
2017-06-13 12:31:31,947 Stage-1 map = 69%, reduce = 0%, Cumulative CPU 147.38 sec
2017-06-13 12:31:32,995 Stage-1 map = 100%, reduce = 0%, Cumulative CPU 227.39 sec
2017-06-13 12:31:44,642 Stage-1 map = 100%, reduce = 1%, Cumulative CPU 233.21 sec
2017-06-13 12:31:45,862 Stage-1 map = 100%, reduce = 12%, Cumulative CPU 336.67 sec
2017-06-13 12:31:46,921 Stage-1 map = 100%, reduce = 93%, Cumulative CPU 1073.95 sec
2017-06-13 12:31:47,971 Stage-1 map = 100%, reduce = 98%, Cumulative CPU 1134.23 sec
2017-06-13 12:31:49,008 Stage-1 map = 100%, reduce = 100%, Cumulative CPU 1155.42 sec
MapReduce Total cumulative CPU time: 19 minutes 15 seconds 420 msec
Ended Job = job_1481171450798_0411
Launching Job 2 out of 2
Number of reduce tasks not specified. Estimated from input data size: 3
In order to change the average load for a reducer (in bytes):
set hive.exec.reducers.bytes.per.reducer=<number>
In order to limit the maximum number of reducers:
set hive.exec.reducers.max=<number>
In order to set a constant number of reducers:
set mapreduce.job.reduces=<number>
Starting Job = job_1481171450798_0412, Tracking URL = https://svrhdpm03.mch.corp.int:8090/proxy/application_1481171450798_0412/
Kill Command = /opt/cloudera/parcels/CDH-5.7.1-1.cdh5.7.1.p0.11/lib/hadoop/bin/hadoop job -kill job_1481171450798_0412
Hadoop job information for Stage-2: number of mappers: 2; number of reducers: 3
2017-06-13 12:32:02,433 Stage-2 map = 0%, reduce = 0%
2017-06-13 12:32:15,021 Stage-2 map = 37%, reduce = 0%, Cumulative CPU 21.26 sec
2017-06-13 12:32:18,149 Stage-2 map = 50%, reduce = 0%, Cumulative CPU 29.19 sec
2017-06-13 12:32:19,197 Stage-2 map = 67%, reduce = 0%, Cumulative CPU 30.15 sec
2017-06-13 12:32:21,287 Stage-2 map = 73%, reduce = 0%, Cumulative CPU 33.95 sec
2017-06-13 12:32:23,379 Stage-2 map = 79%, reduce = 0%, Cumulative CPU 37.9 sec
2017-06-13 12:32:26,518 Stage-2 map = 83%, reduce = 0%, Cumulative CPU 41.63 sec
2017-06-13 12:32:32,771 Stage-2 map = 100%, reduce = 0%, Cumulative CPU 48.1 sec
2017-06-13 12:32:44,258 Stage-2 map = 100%, reduce = 28%, Cumulative CPU 59.55 sec
2017-06-13 12:32:46,349 Stage-2 map = 100%, reduce = 33%, Cumulative CPU 63.23 sec
2017-06-13 12:32:47,393 Stage-2 map = 100%, reduce = 97%, Cumulative CPU 87.96 sec
2017-06-13 12:32:48,440 Stage-2 map = 100%, reduce = 100%, Cumulative CPU 89.67 sec
MapReduce Total cumulative CPU time: 1 minutes 29 seconds 670 msec
Ended Job = job_1481171450798_0412
Moving data to: hdfs://nameservice1/user/hive/warehouse/udfautoincrement.db/encounter_id_count_window
Table udfautoincrement.encounter_id_count_window stats: [numFiles=3, numRows=7954740, totalSize=85724875, rawDataSize=77770135]
MapReduce Jobs Launched:
Stage-Stage-1: Map: 32 Reduce: 134 Cumulative CPU: 1155.42 sec HDFS Read: 8945699403 HDFS Write: 172681731 SUCCESS
Stage-Stage-2: Map: 2 Reduce: 3 Cumulative CPU: 89.67 sec HDFS Read: 172732010 HDFS Write: 85725202 SUCCESS
Total MapReduce CPU Time Spent: 20 minutes 45 seconds 90 msec
OK
Time taken: 102.559 seconds
hive> describe encounter_id_count_window;
OK
encounter_id int
_wcol0 bigint
Time taken: 0.092 seconds, Fetched: 2 row(s)
hive> select * from encounter_id_count_window where _wcol0>1;
FAILED: ParseException line 1:52 missing \' at '>' near '<EOF>'
hive> select * from encounter_id_count_window where _wcol0 > 1;
FAILED: ParseException line 1:53 missing \' at '>' near '<EOF>'
hive> select * from encounter_id_count_window where `_wcol0` > 1;
Query ID = jthatipa_20170613124040_66521522-8841-4e40-8fca-d4eb9fae2bd6
Total jobs = 1
Launching Job 1 out of 1
Number of reduce tasks is set to 0 since there's no reduce operator
Starting Job = job_1481171450798_0413, Tracking URL = https://svrhdpm03.mch.corp.int:8090/proxy/application_1481171450798_0413/
Kill Command = /opt/cloudera/parcels/CDH-5.7.1-1.cdh5.7.1.p0.11/lib/hadoop/bin/hadoop job -kill job_1481171450798_0413
Hadoop job information for Stage-1: number of mappers: 1; number of reducers: 0
2017-06-13 12:40:14,641 Stage-1 map = 0%, reduce = 0%
2017-06-13 12:40:27,094 Stage-1 map = 100%, reduce = 0%, Cumulative CPU 5.66 sec
MapReduce Total cumulative CPU time: 5 seconds 660 msec
Ended Job = job_1481171450798_0413
MapReduce Jobs Launched:
Stage-Stage-1: Map: 1 Cumulative CPU: 5.66 sec HDFS Read: 85728587 HDFS Write: 0 SUCCESS
Total MapReduce CPU Time Spent: 5 seconds 660 msec
OK
Time taken: 23.897 seconds
hive> select encounter_id,count(*) from w_encounter_seqid group by encounter_id having count(*)>1;
Query ID = jthatipa_20170613124343_354a244d-0134-498e-b988-66d9c1a9da16
Total jobs = 1
Launching Job 1 out of 1
Number of reduce tasks not specified. Estimated from input data size: 134
In order to change the average load for a reducer (in bytes):
set hive.exec.reducers.bytes.per.reducer=<number>
In order to limit the maximum number of reducers:
set hive.exec.reducers.max=<number>
In order to set a constant number of reducers:
set mapreduce.job.reduces=<number>
Starting Job = job_1481171450798_0414, Tracking URL = https://svrhdpm03.mch.corp.int:8090/proxy/application_1481171450798_0414/
Kill Command = /opt/cloudera/parcels/CDH-5.7.1-1.cdh5.7.1.p0.11/lib/hadoop/bin/hadoop job -kill job_1481171450798_0414
Hadoop job information for Stage-1: number of mappers: 32; number of reducers: 134
2017-06-13 12:43:28,242 Stage-1 map = 0%, reduce = 0%
2017-06-13 12:43:37,633 Stage-1 map = 3%, reduce = 0%, Cumulative CPU 4.13 sec
2017-06-13 12:43:39,737 Stage-1 map = 56%, reduce = 0%, Cumulative CPU 109.49 sec
2017-06-13 12:43:40,784 Stage-1 map = 100%, reduce = 0%, Cumulative CPU 232.82 sec
2017-06-13 12:43:53,427 Stage-1 map = 100%, reduce = 22%, Cumulative CPU 423.33 sec
2017-06-13 12:43:54,478 Stage-1 map = 100%, reduce = 74%, Cumulative CPU 898.92 sec
2017-06-13 12:43:55,528 Stage-1 map = 100%, reduce = 94%, Cumulative CPU 1088.15 sec
2017-06-13 12:43:56,577 Stage-1 map = 100%, reduce = 100%, Cumulative CPU 1140.97 sec
MapReduce Total cumulative CPU time: 19 minutes 0 seconds 970 msec
Ended Job = job_1481171450798_0414
MapReduce Jobs Launched:
Stage-Stage-1: Map: 32 Reduce: 134 Cumulative CPU: 1140.97 sec HDFS Read: 8945936979 HDFS Write: 0 SUCCESS
Total MapReduce CPU Time Spent: 19 minutes 0 seconds 970 msec
OK
Time taken: 42.543 seconds
hive> SELECT udf_updated_encounter_id, row_number() OVER (ORDER BY udf_updated_encounter_id) as row_number from w_encounter_seqid;

 

// Some data precedes this; I can't upload the full output.

9_6812064 7954733
9_7258820 7954734
9_7667038 7954735
9_8107154 7954736
9_8511292 7954737
9_8935037 7954738
9_9394735 7954739
9_9809614 7954740

// Expected the last row to be 7954740_9809614 7954740, but got 9_9809614 7954740.
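
If the goal is at least one continuous counter per JVM, a sketch under that assumption (the class name JvmWideCounterSketch is hypothetical, and this is untested as a Hive UDF) would make both the guard and the counter static and atomic. Note this still cannot yield one cluster-wide sequence, because separate map tasks run in separate JVMs:

import java.util.concurrent.atomic.AtomicLong;

public class JvmWideCounterSketch {
    // One counter shared by every instance in this JVM; AtomicLong keeps the
    // increment safe if evaluate() is ever called from multiple threads.
    private static final AtomicLong COUNTER = new AtomicLong(0);

    public static String evaluate(Long inputEmpId, Long start) {
        // Seed exactly once with the supplied starting value (assumes start > 0).
        COUNTER.compareAndSet(0L, start);
        return COUNTER.getAndIncrement() + "_" + inputEmpId;
    }

    public static void main(String[] args) {
        System.out.println(evaluate(123456789L, 101L)); // 101_123456789
        System.out.println(evaluate(123456790L, 101L)); // 102_123456790
    }
}

For a single cluster-wide sequence, a windowing function such as the row_number() query above is generally the safer route, since UDF state never spans tasks.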

 

Please review the code and the process and let me know how to resolve this.

 
