## 数据如果单节点硬盘存不下,可以将数据按月份分散到多台机器,修改 `ls 2017XXX`来处理数据 output="`pwd`/output" mkdir -p $output for f in `ls 2017*`;do go run index.go -file="$f" -out="`pwd`/output" done
# 过程sql select user_id, max((event_id=10001)) as step1_condition, sequenceMatch('(?1).*(?2)')(toDateTime(timestamp),(event_id=10003),(event_id=10004)) as step2_condition, sequenceMatch('(?1).*(?2).*(?3)')(toDateTime(timestamp),(event_id=10003),(event_id=10004),(event_id=10007)) as step3_condition from trend_event where event_date >= '2017-01-01' and event_date < '2017-01-15' groupby user_id limit10
1 2 3 4 5 6 7
SELECT sum(step1_condition) as step1_achieved, sum(step2_condition) as step2_achieved, sum(step3_condition) as step3_achieved FROM ( # 上述`过程sql` )
select user_id as userId, eventID, EventTimestamp from ( select user_id, groupArray(event_id) asevents, groupArray(timestamp) as times, arrayEnumerate(events) as events_index, arrayFilter(name,index -> (index=1) or (events[index] != events[index-1]),events,events_index) as events_filt, arrayFilter(time,index -> (index=1) or (events[index] != events[index-1]), times,events_index) as times_filt FROM (select * from trend_event where event_date='2017-01-30'orderbytimestamp) groupby user_id ) arrayJoin events_filt as eventID, times_filt as EventTimestamp limit10
SELECT sum(step1_condition) as step1_achieved, sum(step2_condition) as step2_achieved, sum(step3_condition) as step3_achieved FROM ( select user_id, max((event_id=10001)) as step1_condition, sequenceMatch('(?1).*(?2)')(toDateTime(timestamp),(event_id=10003),(event_id=10004)) as step2_condition, sequenceMatch('(?1).*(?2).*(?3)')(toDateTime(timestamp),(event_id=10003),(event_id=10004),(event_id=10007)) as step3_condition from ( select user_id, eventID as event_id, EventTimestamp astimestamp from ( select user_id, groupArray(event_id) asevents, groupArray(timestamp) as times, arrayEnumerate(events) as events_index, arrayFilter(name,index -> (index=1) or (events[index] != events[index-1]),events,events_index) as events_filt, arrayFilter(time,index -> (index=1) or (events[index] != events[index-1]), times,events_index) as times_filt FROM (select * from trend_event where event_date >= '2017-01-01'and event_date < '2017-01-02' orderbytimestamp) groupby user_id ) arrayJoin events_filt as eventID, times_filt as EventTimestamp ) groupby user_id )
SELECT sum(step1_condition) as step1_achieved, sum(step2_condition) as step2_achieved, sum(step3_condition) as step3_achieved, sum(step4_condition) as step4_achieved, sum(step5_condition) as step5_achieved FROM ( select user_id, max((event_id=10002)) as step1_condition, sequenceMatch('(?1).*(?2)')(toDateTime(timestamp),(event_id=10002),(event_id=10003and event_tag_content='Apple')) as step2_condition, sequenceMatch('(?1).*(?2).*(?3)')(toDateTime(timestamp),(event_id=10002),(event_id=10003and event_tag_content='Apple'),(event_id=10004and event_tag_price>5000)) as step3_condition, sequenceMatch('(?1).*(?2).*(?3).*(?4)')(toDateTime(timestamp),(event_id=10002),(event_id=10003and event_tag_content='Apple'),(event_id=10004and event_tag_price>5000),(event_id=10007)) as step4_condition, sequenceMatch('(?1).*(?2).*(?3).*(?4).*(?5)')(toDateTime(timestamp),(event_id=10002),(event_id=10003and event_tag_content='Apple'),(event_id=10004and event_tag_price>5000),(event_id=10007),(event_id=10008)) as step5_condition from trend_event where event_date >= '2017-01-01' and event_date < '2017-01-08' groupby user_id )