Support Questions
Find answers, ask questions, and share your expertise

How to extract specific keywords from a file using pyspark

How to extract specific keywords from a file using pyspark

Super Collaborator

I have a CSV with only two columns:

1st column: username

2nd column: Query run by the user

I want to extract the following from this file:

1. username

2. Query operation (Select, Insert, Delete etc)

3. SchemaName.TableName

Data Snippet:

sys_load_spoke,INSERT INTO emcas_gbs_analytics.opportunity_new(row_wid,id,accountid,name,opportunity_number__c,stagename,country__c,closedate,leadsource,createddate,createdbyid,lastmodifieddate,partner__c,tier_2_partner__c,competitor__c,originator__c,sales_channel__c,sales_force__c,additional_forecast_status__c,close_comments__c,compelling_event_why_now__c,deal_lead__c,ela__c,emc_s_unique_value_why_emc__c,next_steps_2__c,next_steps_last_updated_date__c,next_steps__c,opportunity_owner__c,party_number__c,quote_cart_number__c,quote_cart__c,quote_operating_unit__c,renewals_type__c,state__c,account_name1__c,quarter_revenue_forecast__c,emc_classification__c,competitor_lost_to__c,contract_modifier__c,quote_start_date__c,quote_end_date__c,contract_number__c,comments__c,account_theatertext__c,renewals_bu__c,opportunity_closed_won_date__c,opportunity_closed_date__c,ela2__c,renewals_timing__c,renewals_sales_stage__c,bill_to__c,closed_reason_action__c,discount_percent__c,hw_discount_percent__c,hw_tla_start_date__c,renewals_close_details__c,sw_discount_percent__c,booking_status__c,mco__c,rss_coordinator__c,quote_created_by__c,billing_frequency__c,po_number__c,manager_comments__c,t15_notification__c,t45_notification_sent_on__c,t45_notification__c,t90_notification_sent_on__c,t90_notification__c,redirects_reason_codes__c,executive_sponsor_key_players__c,deal_grouping_name__c,mced_hw__c,renewals_timing_hw__c,agreements__c,comments_details__c,at_risk__c,closed_reason__c,customer_feedback__c,no_notification_reason__c,no_notification__c,notification_comments__c,notification_contact__c,pricing_methodology__c,quote_for_refresh__c,renewals_health_flag__c,vsoe_compliant__c,disposition_code__c,opportunity_age__c,renewals_campaign__c,renewals_campaign_details__c,bill_to_ship_to_comments__c,w_insert_dt,w_update_dt,conversionrate,amount_usd,quote_amount_usd,program_name__c,w_row_hash,forecast_amount,currencyisocode,ownerid) VALUES ( $1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, $17, $18, $19, $20, $21, $22, $23, $24, $25, $26, $27, $28, $29, $30, $31, $32, $33, $34, $35, $36, $37, $38, $39, $40, $41, $42, $43, $44, $45, $46, $47, $48, $49, $50, $51, $52, $53, $54, $55, $56, $57, $58, $59, $60, $61, $62, $63, $64, $65, $66, $67, $68, $69, $70, $71, $72, $73, $74, $75, $76, $77, $78, $79, $80, $81, $82, $83, $84, $85, $86, $87, $88, $89, $90, $91, $92, $93, $94, $95, $96, $97, $98, $99, $100, $101, $102)

sys_load_spoke,INSERT INTO emcas_gbs_analytics.opportunity_new(row_wid,id,accountid,name,opportunity_number__c,stagename,country__c,closedate,leadsource,createddate,createdbyid,lastmodifieddate,partner__c,tier_2_partner__c,competitor__c,originator__c,sales_channel__c,sales_force__c,additional_forecast_status__c,close_comments__c,compelling_event_why_now__c,deal_lead__c,ela__c,emc_s_unique_value_why_emc__c,next_steps_2__c,next_steps_last_updated_date__c,next_steps__c,opportunity_owner__c,party_number__c,quote_cart_number__c,quote_cart__c,quote_operating_unit__c,renewals_type__c,state__c,account_name1__c,quarter_revenue_forecast__c,emc_classification__c,competitor_lost_to__c,contract_modifier__c,quote_start_date__c,quote_end_date__c,contract_number__c,comments__c,account_theatertext__c,renewals_bu__c,opportunity_closed_won_date__c,opportunity_closed_date__c,ela2__c,renewals_timing__c,renewals_sales_stage__c,bill_to__c,closed_reason_action__c,discount_percent__c,hw_discount_percent__c,hw_tla_start_date__c,renewals_close_details__c,sw_discount_percent__c,booking_status__c,mco__c,rss_coordinator__c,quote_created_by__c,billing_frequency__c,po_number__c,manager_comments__c,t15_notification__c,t45_notification_sent_on__c,t45_notification__c,t90_notification_sent_on__c,t90_notification__c,redirects_reason_codes__c,executive_sponsor_key_players__c,deal_grouping_name__c,mced_hw__c,renewals_timing_hw__c,agreements__c,comments_details__c,at_risk__c,closed_reason__c,customer_feedback__c,no_notification_reason__c,no_notification__c,notification_comments__c,notification_contact__c,pricing_methodology__c,quote_for_refresh__c,renewals_health_flag__c,vsoe_compliant__c,disposition_code__c,opportunity_age__c,renewals_campaign__c,renewals_campaign_details__c,bill_to_ship_to_comments__c,w_insert_dt,w_update_dt,conversionrate,amount_usd,quote_amount_usd,program_name__c,w_row_hash,forecast_amount,currencyisocode,ownerid) VALUES ( $1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, $17, $18, $19, $20, $21, $22, $23, $24, $25, $26, $27, $28, $29, $30, $31, $32, $33, $34, $35, $36, $37, $38, $39, $40, $41, $42, $43, $44, $45, $46, $47, $48, $49, $50, $51, $52, $53, $54, $55, $56, $57, $58, $59, $60, $61, $62, $63, $64, $65, $66, $67, $68, $69, $70, $71, $72, $73, $74, $75, $76, $77, $78, $79, $80, $81, $82, $83, $84, $85, $86, $87, $88, $89, $90, $91, $92, $93, $94, $95, $96, $97, $98, $99, $100, $101, $102)