Issue in generating an auto-increment ID for a unique ID

Here is the UDF:

package com.mch.hive.hiveudf;

import org.apache.hadoop.hive.ql.exec.UDF;
import org.apache.hadoop.hive.ql.udf.UDFType;

@UDFType(deterministic = false) //, stateful = true)
public class MchAutoIncrementIdUDF extends UDF {

    long Incremental_Start_Value = 0; // Current counter value; seeded on the first call

    static long flag = 0; // Stays 0 until the counter has been seeded

    // Takes the employee ID and the initial counter value as parameters
    public String evaluate(Long input_Emp_Id, Long incremental_Value) {
        String result;

        if (flag == 0) { // Seed the counter on the first invocation only
            Incremental_Start_Value = incremental_Value;
            flag++;
        }

        result = Incremental_Start_Value + "_" + input_Emp_Id; // Concatenate counter and employee ID
        Incremental_Start_Value++;

        return result; // e.g. "101_123456789"
    }
}
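One thing I am aware of: flag and Incremental_Start_Value live inside a single task's JVM, so if the job runs with more than one map task, each task seeds and increments its own copy of the counter. A possible alternative that avoids per-task state entirely is Hive's built-in row_number(); a minimal sketch (table and column names taken from my setup below; the ORDER BY column is just a placeholder):

SELECT concat(cast(row_number() OVER (ORDER BY encounter_id) AS string),
              '_', cast(encounter_id AS string)) AS udf_updated_encounter_id,
       encounter_id
FROM udfautoincrement.w_encounter_may23;

Because there is no PARTITION BY, the window function is evaluated after a shuffle and the numbering is global, regardless of how many mappers read the input.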

Here is the table DDL, followed by the UDF registration:

CREATE TABLE IF NOT EXISTS w_encounter_inc(
encounter_id int,
udf_updated_encounter_id string,
medical_record_number string,
financial_number string,
patient_account_number double,
ms_drg_id string,
ms_drg_description string,
apr_drg_id string,
apr_drg_description string,
rom_code_value double,
apr_risk_of_mortality string,
soi_code_value double,
apr_severity_of_illness string,
ms_mdc_code_id double,
ms_mdc_code_description string,
apr_mdc_code_id double,
apr_mdc_code_description string,
ms_drg_weight double,
ms_gmlos double,
ms_amlos double,
primary_health_plan_id double,
secondary_health_plan_id double,
primary_plan_type_code double,
default_guarantor_id double,
family_member_id double,
attending_physician_id double,
admitting_physician_id double,
referring_physician_id double,
primary_care_physician_id double,
consult_physician_1_id double,
consult_physician_2_id double,
consult_physician_3_id double,
consult_physician_4_id double,
admit_location_code double,
admit_admit_type_code double,
admit_location_facility_code double,
admit_location_building_code double,
admit_location_room_code double,
admit_location_bed_code double,
admit_arrive_date string,
admit_depart_date string,
disch_location_code double,
disch_admit_type_code double,
disch_location_facility_code double,
disch_location_building_code double,
disch_location_room_code double,
disch_location_bed_code double,
disch_arrive_date string,
disch_depart_date string,
race_code double,
race_display string,
ethnic_group_code double,
ethnic_group string,
language_code double,
language_display string,
sex_code double,
sex string,
religion_code double,
religion_display string,
person_id double,
encounter_class_code double,
encounter_type_code double,
encounter_type_class_code double,
encounter_status_code double,
pre_reg_date string,
pre_reg_personnel_id double,
reg_date string,
reg_personnel_id double,
est_arrive_date string,
est_depart_date string,
arrive_date string,
depart_date string,
legacy_discharge_day string,
admission_day string,
admit_type_code double,
admit_src_code double,
admit_mode_code double,
referring_comment string,
disch_disposition_code double,
disch_to_location_code double,
accommodation_code double,
accommodation_request_code double,
ambulatory_cond_code double,
courtesy_code double,
isolation_code double,
med_service_code double,
confid_level_code double,
location_code double,
location_facility_code double,
location_building_code double,
location_nurse_unit_code double,
location_room_code double,
location_bed_code double,
disch_date string,
location_temp_code double,
organization_id double,
reason_for_visit string,
encounter_financial_id double,
financial_class_code double,
trauma_code double,
triage_date string,
visitor_status_code double,
refer_facility_code double,
trauma_date string,
accommodation_reason_code double,
encounter_complete_date string,
assign_to_location_date string,
inpatient_admit_date string,
legacy_admission_day string,
initial_bill_day string,
last_bill_cycle_day string,
current_balance double,
balance_of_unbilled_charges double,
total_charges_for_account double,
total_adj_for_this_account double,
total_payments_for_account double,
expected_payments double,
last_payment_day string,
last_payment_amount double,
balance_on_last_statement double,
total_costs double,
total_fixed_costs double,
total_variable_costs double,
total_direct_costs double,
total_indirect_costs double,
surgery_start_date string,
surgeon_physician_id double,
dw_creation_date string,
dw_update_date string,
tertiary_health_plan_id double,
quaternary_health_plan_id double,
legacy_payor_id1 string,
legacy_payor_id2 string,
legacy_payor_id3 string,
legacy_payor_id4 string,
admission_date string,
discharge_date string,
pb_ins_payments_applied double,
pb_ins_adjustments_applied double,
pb_patient_payments_applied double,
pb_patient_adjustments_applied double,
pb_withhold_applied double,
pb_approved_amount double,
pb_value double,
pb_value_balance double,
pb_amount double,
pb_balance double,
pb_rvu_work double,
primary_diagnosis_number string,
primary_diagnosis_vocabulary string,
primary_procedure_number string,
primary_procedure_vocabulary string,
travel_indicator string,
frequent_flier_flag string,
marketing_analytic_id string,
mrn_alias_pool_display string,
visit_id string)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '\t'
LINES TERMINATED BY '\n'
STORED AS TEXTFILE;

add jar /home/MCH/jthatipa/backup/AutoIncLong123.jar;

create function INCREMENT_SEQID as 'com.mch.hive.hiveudf.MchAutoIncrementIdUDF';

set hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat;
set mapred.map.tasks = 1;
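From what I understand, mapred.map.tasks is only a hint to MapReduce (and a deprecated property name), and HiveInputFormat disables split combining, so these settings do not guarantee a single map task. If the goal is to funnel all rows through one JVM so the counter stays sequential, something along these lines may come closer — the split-size value here is an arbitrary assumption on my part, and even this is not a hard guarantee:

set hive.input.format=org.apache.hadoop.hive.ql.io.CombineHiveInputFormat;
-- combine input files into as few splits as possible (value is an arbitrary large byte count)
set mapreduce.input.fileinputformat.split.maxsize=1099511627776;
set mapreduce.job.maps=1; -- current name for mapred.map.tasks; still only a hint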
INSERT INTO TABLE udfautoincrement.w_encounter_inc
SELECT encounter_id,
INCREMENT_SEQID(cast(ENCOUNTER_ID as int), 1) as udf_updated_encounter_id,
medical_record_number,
financial_number,
patient_account_number,
ms_drg_id,
ms_drg_description,
apr_drg_id,
apr_drg_description,
rom_code_value,
apr_risk_of_mortality,
soi_code_value,
apr_severity_of_illness,
ms_mdc_code_id,
ms_mdc_code_description,
apr_mdc_code_id,
apr_mdc_code_description,
ms_drg_weight,
ms_gmlos,
ms_amlos,
primary_health_plan_id,
secondary_health_plan_id,
primary_plan_type_code,
default_guarantor_id,
family_member_id,
attending_physician_id,
admitting_physician_id,
referring_physician_id,
primary_care_physician_id,
consult_physician_1_id,
consult_physician_2_id,
consult_physician_3_id,
consult_physician_4_id,
admit_location_code,
admit_admit_type_code,
admit_location_facility_code,
admit_location_building_code,
admit_location_room_code,
admit_location_bed_code,
admit_arrive_date,
admit_depart_date,
disch_location_code,
disch_admit_type_code,
disch_location_facility_code,
disch_location_building_code,
disch_location_room_code,
disch_location_bed_code,
disch_arrive_date,
disch_depart_date,
race_code,
race_display,
ethnic_group_code,
ethnic_group,
language_code,
language_display,
sex_code,
sex,
religion_code,
religion_display,
person_id,
encounter_class_code,
encounter_type_code,
encounter_type_class_code,
encounter_status_code,
pre_reg_date,
pre_reg_personnel_id,
reg_date,
reg_personnel_id,
est_arrive_date,
est_depart_date,
arrive_date,
depart_date,
legacy_discharge_day,
admission_day,
admit_type_code,
admit_src_code,
admit_mode_code,
referring_comment,
disch_disposition_code,
disch_to_location_code,
accommodation_code,
accommodation_request_code,
ambulatory_cond_code,
courtesy_code,
isolation_code,
med_service_code,
confid_level_code,
location_code,
location_facility_code,
location_building_code,
location_nurse_unit_code,
location_room_code,
location_bed_code,
disch_date,
location_temp_code,
organization_id,
reason_for_visit,
encounter_financial_id,
financial_class_code,
trauma_code,
triage_date,
visitor_status_code,
refer_facility_code,
trauma_date,
accommodation_reason_code,
encounter_complete_date,
assign_to_location_date,
inpatient_admit_date,
legacy_admission_day,
initial_bill_day,
last_bill_cycle_day,
current_balance,
balance_of_unbilled_charges,
total_charges_for_account,
total_adj_for_this_account,
total_payments_for_account,
expected_payments,
last_payment_day,
last_payment_amount,
balance_on_last_statement,
total_costs,
total_fixed_costs,
total_variable_costs,
total_direct_costs,
total_indirect_costs,
surgery_start_date,
surgeon_physician_id,
dw_creation_date,
dw_update_date,
tertiary_health_plan_id,
quaternary_health_plan_id,
legacy_payor_id1,
legacy_payor_id2,
legacy_payor_id3,
legacy_payor_id4,
admission_date,
discharge_date,
pb_ins_payments_applied,
pb_ins_adjustments_applied,
pb_patient_payments_applied,
pb_patient_adjustments_applied,
pb_withhold_applied,
pb_approved_amount,
pb_value,
pb_value_balance,
pb_amount,
pb_balance,
pb_rvu_work,
primary_diagnosis_number,
primary_diagnosis_vocabulary,
primary_procedure_number,
primary_procedure_vocabulary,
travel_indicator,
frequent_flier_flag,
marketing_analytic_id,
mrn_alias_pool_display,
visit_id from udfautoincrement.w_encounter_may23;
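To sanity-check the generated IDs after the load, a direct uniqueness test on the generated column shows whether the UDF produced duplicates (table name per the DDL above):

SELECT udf_updated_encounter_id, count(*) AS cnt
FROM udfautoincrement.w_encounter_inc
GROUP BY udf_updated_encounter_id
HAVING count(*) > 1;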

 

Here is the execution output:

45900409 1 7954727
45900412 1 7954728
45900416 1 7954729
45900419 1 7954730
45900425 1 7954731
45900429 1 7954732
45900432 1 7954733
45900436 1 7954734
45900439 1 7954735
45900443 1 7954736
45900446 1 7954737
45900449 1 7954738
45900452 1 7954739
45900456 1 7954740
Time taken: 183.656 seconds, Fetched: 7954740 row(s)
hive> create table encounter_id_count_window as select encounter_id, count(encounter_id) over (partition by encounter_id) from w_encounter_seqid group by encounter_id;
Query ID = jthatipa_20170613123131_3478898c-1a41-4611-8741-612745621319
Total jobs = 2
Launching Job 1 out of 2
Number of reduce tasks not specified. Estimated from input data size: 134
In order to change the average load for a reducer (in bytes):
set hive.exec.reducers.bytes.per.reducer=<number>
In order to limit the maximum number of reducers:
set hive.exec.reducers.max=<number>
In order to set a constant number of reducers:
set mapreduce.job.reduces=<number>
Starting Job = job_1481171450798_0411, Tracking URL = https://svrhdpm03.mch.corp.int:8090/proxy/application_1481171450798_0411/
Kill Command = /opt/cloudera/parcels/CDH-5.7.1-1.cdh5.7.1.p0.11/lib/hadoop/bin/hadoop job -kill job_1481171450798_0411
Hadoop job information for Stage-1: number of mappers: 32; number of reducers: 134
2017-06-13 12:31:20,482 Stage-1 map = 0%, reduce = 0%
2017-06-13 12:31:29,859 Stage-1 map = 3%, reduce = 0%, Cumulative CPU 3.82 sec
2017-06-13 12:31:31,947 Stage-1 map = 69%, reduce = 0%, Cumulative CPU 147.38 sec
2017-06-13 12:31:32,995 Stage-1 map = 100%, reduce = 0%, Cumulative CPU 227.39 sec
2017-06-13 12:31:44,642 Stage-1 map = 100%, reduce = 1%, Cumulative CPU 233.21 sec
2017-06-13 12:31:45,862 Stage-1 map = 100%, reduce = 12%, Cumulative CPU 336.67 sec
2017-06-13 12:31:46,921 Stage-1 map = 100%, reduce = 93%, Cumulative CPU 1073.95 sec
2017-06-13 12:31:47,971 Stage-1 map = 100%, reduce = 98%, Cumulative CPU 1134.23 sec
2017-06-13 12:31:49,008 Stage-1 map = 100%, reduce = 100%, Cumulative CPU 1155.42 sec
MapReduce Total cumulative CPU time: 19 minutes 15 seconds 420 msec
Ended Job = job_1481171450798_0411
Launching Job 2 out of 2
Number of reduce tasks not specified. Estimated from input data size: 3
In order to change the average load for a reducer (in bytes):
set hive.exec.reducers.bytes.per.reducer=<number>
In order to limit the maximum number of reducers:
set hive.exec.reducers.max=<number>
In order to set a constant number of reducers:
set mapreduce.job.reduces=<number>
Starting Job = job_1481171450798_0412, Tracking URL = https://svrhdpm03.mch.corp.int:8090/proxy/application_1481171450798_0412/
Kill Command = /opt/cloudera/parcels/CDH-5.7.1-1.cdh5.7.1.p0.11/lib/hadoop/bin/hadoop job -kill job_1481171450798_0412
Hadoop job information for Stage-2: number of mappers: 2; number of reducers: 3
2017-06-13 12:32:02,433 Stage-2 map = 0%, reduce = 0%
2017-06-13 12:32:15,021 Stage-2 map = 37%, reduce = 0%, Cumulative CPU 21.26 sec
2017-06-13 12:32:18,149 Stage-2 map = 50%, reduce = 0%, Cumulative CPU 29.19 sec
2017-06-13 12:32:19,197 Stage-2 map = 67%, reduce = 0%, Cumulative CPU 30.15 sec
2017-06-13 12:32:21,287 Stage-2 map = 73%, reduce = 0%, Cumulative CPU 33.95 sec
2017-06-13 12:32:23,379 Stage-2 map = 79%, reduce = 0%, Cumulative CPU 37.9 sec
2017-06-13 12:32:26,518 Stage-2 map = 83%, reduce = 0%, Cumulative CPU 41.63 sec
2017-06-13 12:32:32,771 Stage-2 map = 100%, reduce = 0%, Cumulative CPU 48.1 sec
2017-06-13 12:32:44,258 Stage-2 map = 100%, reduce = 28%, Cumulative CPU 59.55 sec
2017-06-13 12:32:46,349 Stage-2 map = 100%, reduce = 33%, Cumulative CPU 63.23 sec
2017-06-13 12:32:47,393 Stage-2 map = 100%, reduce = 97%, Cumulative CPU 87.96 sec
2017-06-13 12:32:48,440 Stage-2 map = 100%, reduce = 100%, Cumulative CPU 89.67 sec
MapReduce Total cumulative CPU time: 1 minutes 29 seconds 670 msec
Ended Job = job_1481171450798_0412
Moving data to: hdfs://nameservice1/user/hive/warehouse/udfautoincrement.db/encounter_id_count_window
Table udfautoincrement.encounter_id_count_window stats: [numFiles=3, numRows=7954740, totalSize=85724875, rawDataSize=77770135]
MapReduce Jobs Launched:
Stage-Stage-1: Map: 32 Reduce: 134 Cumulative CPU: 1155.42 sec HDFS Read: 8945699403 HDFS Write: 172681731 SUCCESS
Stage-Stage-2: Map: 2 Reduce: 3 Cumulative CPU: 89.67 sec HDFS Read: 172732010 HDFS Write: 85725202 SUCCESS
Total MapReduce CPU Time Spent: 20 minutes 45 seconds 90 msec
OK
Time taken: 102.559 seconds
hive> describe encounter_id_count_window;
OK
encounter_id int
_wcol0 bigint
Time taken: 0.092 seconds, Fetched: 2 row(s)
hive> select * from encounter_id_count_window where _wcol0>1;
FAILED: ParseException line 1:52 missing \' at '>' near '<EOF>'
hive> select * from encounter_id_count_window where _wcol0 > 1;
FAILED: ParseException line 1:53 missing \' at '>' near '<EOF>'
hive> select * from encounter_id_count_window where `_wcol0` > 1;
Query ID = jthatipa_20170613124040_66521522-8841-4e40-8fca-d4eb9fae2bd6
Total jobs = 1
Launching Job 1 out of 1
Number of reduce tasks is set to 0 since there's no reduce operator
Starting Job = job_1481171450798_0413, Tracking URL = https://svrhdpm03.mch.corp.int:8090/proxy/application_1481171450798_0413/
Kill Command = /opt/cloudera/parcels/CDH-5.7.1-1.cdh5.7.1.p0.11/lib/hadoop/bin/hadoop job -kill job_1481171450798_0413
Hadoop job information for Stage-1: number of mappers: 1; number of reducers: 0
2017-06-13 12:40:14,641 Stage-1 map = 0%, reduce = 0%
2017-06-13 12:40:27,094 Stage-1 map = 100%, reduce = 0%, Cumulative CPU 5.66 sec
MapReduce Total cumulative CPU time: 5 seconds 660 msec
Ended Job = job_1481171450798_0413
MapReduce Jobs Launched:
Stage-Stage-1: Map: 1 Cumulative CPU: 5.66 sec HDFS Read: 85728587 HDFS Write: 0 SUCCESS
Total MapReduce CPU Time Spent: 5 seconds 660 msec
OK
Time taken: 23.897 seconds
hive> select encounter_id,count(*) from w_encounter_seqid group by encounter_id having count(*)>1;
Query ID = jthatipa_20170613124343_354a244d-0134-498e-b988-66d9c1a9da16
Total jobs = 1
Launching Job 1 out of 1
Number of reduce tasks not specified. Estimated from input data size: 134
In order to change the average load for a reducer (in bytes):
set hive.exec.reducers.bytes.per.reducer=<number>
In order to limit the maximum number of reducers:
set hive.exec.reducers.max=<number>
In order to set a constant number of reducers:
set mapreduce.job.reduces=<number>
Starting Job = job_1481171450798_0414, Tracking URL = https://svrhdpm03.mch.corp.int:8090/proxy/application_1481171450798_0414/
Kill Command = /opt/cloudera/parcels/CDH-5.7.1-1.cdh5.7.1.p0.11/lib/hadoop/bin/hadoop job -kill job_1481171450798_0414
Hadoop job information for Stage-1: number of mappers: 32; number of reducers: 134
2017-06-13 12:43:28,242 Stage-1 map = 0%, reduce = 0%
2017-06-13 12:43:37,633 Stage-1 map = 3%, reduce = 0%, Cumulative CPU 4.13 sec
2017-06-13 12:43:39,737 Stage-1 map = 56%, reduce = 0%, Cumulative CPU 109.49 sec
2017-06-13 12:43:40,784 Stage-1 map = 100%, reduce = 0%, Cumulative CPU 232.82 sec
2017-06-13 12:43:53,427 Stage-1 map = 100%, reduce = 22%, Cumulative CPU 423.33 sec
2017-06-13 12:43:54,478 Stage-1 map = 100%, reduce = 74%, Cumulative CPU 898.92 sec
2017-06-13 12:43:55,528 Stage-1 map = 100%, reduce = 94%, Cumulative CPU 1088.15 sec
2017-06-13 12:43:56,577 Stage-1 map = 100%, reduce = 100%, Cumulative CPU 1140.97 sec
MapReduce Total cumulative CPU time: 19 minutes 0 seconds 970 msec
Ended Job = job_1481171450798_0414
MapReduce Jobs Launched:
Stage-Stage-1: Map: 32 Reduce: 134 Cumulative CPU: 1140.97 sec HDFS Read: 8945936979 HDFS Write: 0 SUCCESS
Total MapReduce CPU Time Spent: 19 minutes 0 seconds 970 msec
OK
Time taken: 42.543 seconds
hive> SELECT udf_updated_encounter_id, row_number() OVER (ORDER BY udf_updated_encounter_id) as row_number from w_encounter_seqid;

// Some output rows precede these; I couldn't upload the full result.

9_6812064 7954733
9_7258820 7954734
9_7667038 7954735
9_8107154 7954736
9_8511292 7954737
9_8935037 7954738
9_9394735 7954739
9_9809614 7954740

// I expect the final row to be 7954740_9809614 7954740, but I am getting 9_9809614 7954740.
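One possible explanation: udf_updated_encounter_id is a string, so the ORDER BY in the window clause compares lexicographically, and "9_9809614" sorts after "7954740_9809614" even though 9 < 7954740 numerically. If that is the cause, ranking by the numeric prefix should fix the ordering; a sketch, assuming the counter_employeeId format shown above:

SELECT udf_updated_encounter_id,
       row_number() OVER (ORDER BY cast(split(udf_updated_encounter_id, '_')[0] AS bigint)) AS rn
FROM w_encounter_seqid;

With numeric ordering, the row numbered 7954740 would carry the largest counter value, which is what my expected output assumes.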

Please review the code and the process above and let me know what is going wrong and how to fix it.