1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156
import sys


def find_all_paths(edges, start_node):
    """Return the path from ``start_node`` to every node reachable from it.

    Args:
        edges: Mapping of parent -> list of children.  The key ``None`` holds
            the top-level nodes (rows whose parent is NULL).
        start_node: Node to start from.  ``None`` means "start at the virtual
            root"; the virtual root itself never appears in a path.

    Returns:
        A list of paths (each a list of nodes) in depth-first pre-order, with
        siblings kept in the order they appear in ``edges``.  One path is
        produced per visited node, not only per leaf.
    """
    # None is only a virtual root, so it contributes nothing to the path.
    initial_path = [] if start_node is None else [start_node]
    paths = []
    stack = [(start_node, initial_path)]
    while stack:
        node, path = stack.pop()
        if path:
            paths.append(path)
        # Push in reverse so the first child is popped (visited) first.
        for child in reversed(edges.get(node, [])):
            if child in path:
                # Guard against cyclic input, which would otherwise loop forever.
                continue
            stack.append((child, path + [child]))
    return paths


def main():
    """Read tab-separated ``parent<TAB>child`` rows from stdin and print paths.

    A parent of ``NULL`` (or ``\\N``, the marker Hive TRANSFORM writes for NULL
    values) denotes a top-level node.  Each path is printed on its own line
    with nodes joined by ``->``.
    """
    edges = {}
    for line in sys.stdin:
        record = line.strip()
        if not record:
            # Blank lines carry no edge; skipping avoids an unpacking error.
            continue
        parent, child = record.split('\t')
        if parent in ('NULL', '\\N'):
            parent = None
        edges.setdefault(parent, []).append(child)

    for path in find_all_paths(edges, None):
        print('->'.join(path))


if __name__ == '__main__':
    main()
-- Replace with the actual path to your Python script.
ADD FILE /path/to/find_paths.py;

-- Stream (parent, child) rows through the script; its output becomes `path`.
SELECT TRANSFORM (parent, child)
    USING 'python find_paths.py'
    AS path
FROM tree_data;
------------------------------------------===================================2
import json


def find_all_paths(edges, start_node=None):
    """Return every root-to-leaf path of the tree as a JSON string.

    Args:
        edges: Iterable of mappings, each with a ``'parent'`` and a
            ``'child'`` key.
        start_node: Node to start from.  When ``None`` (the default), the
            search starts from every root, i.e. every parent that never
            appears as a child.

    Returns:
        ``json.dumps`` of a list of paths; each path is a list of nodes from
        the start node (or a root) down to a leaf.  Roots and siblings are
        visited in the order they first appear in ``edges``.

    Note:
        The input is assumed to be acyclic; a cycle reachable from the start
        node would make the traversal run without terminating.
    """
    adjacency_list = {}
    for edge in edges:
        adjacency_list.setdefault(edge['parent'], []).append(edge['child'])

    paths = []

    def collect(root):
        # Explicit stack instead of recursion so deep trees cannot hit the
        # interpreter's recursion limit.
        stack = [(root, [root])]
        while stack:
            node, path = stack.pop()
            children = adjacency_list.get(node)
            if children is None:
                paths.append(path)
                continue
            # Push in reverse so the first child is expanded first.
            for child in reversed(children):
                stack.append((child, path + [child]))

    # Compare against None explicitly: 0 and '' are valid node identifiers.
    if start_node is not None:
        collect(start_node)
    else:
        child_nodes = {
            child
            for children in adjacency_list.values()
            for child in children
        }
        # Iterate the dict (insertion-ordered) rather than a set so that the
        # output order is deterministic.
        for root in adjacency_list:
            if root not in child_nodes:
                collect(root)

    return json.dumps(paths)
def tree_paths(edges_json):
    """Parse ``edges_json`` and delegate to ``find_all_paths``.

    The JSON text is decoded with ``json.loads`` and the decoded value is
    passed to ``find_all_paths``; that function's result is returned as is.
    """
    return find_all_paths(json.loads(edges_json))
-- Or the path of the uploaded Python executable file.
ADD FILE hdfs:///path/to/your/tree_paths_udf.zip;

-- Adjust this statement to match the type of file you uploaded.
-- NOTE(review): Hive's documented CREATE FUNCTION syntax takes a Java class
-- name and USING JAR|FILE|ARCHIVE '<uri>'; confirm that your engine accepts
-- this Python-based form before relying on it.
CREATE TEMPORARY FUNCTION tree_paths_udf AS 'tree_paths' USING 'python tree_paths_udf.zip';
-- Serialize every edge as a JSON object, pass the whole edge list to the UDF
-- as one JSON array, then split the UDF result into rows.
-- collect_list() yields an array, so it is joined with concat_ws() before the
-- surrounding brackets are added (concat() only accepts string arguments).
-- NOTE(review): splitting the returned JSON on ',' and reading a 'value' key
-- with json_tuple() may not match the UDF's actual output format; verify
-- against real output.
WITH edges AS (
    SELECT CONCAT('{"parent": "', parent, '", "child": "', child, '"}') AS edge_json
    FROM parent_child_relations
)
SELECT path
FROM (
    SELECT explode(
               split(
                   regexp_replace(
                       tree_paths_udf(concat('[', concat_ws(',', collect_list(edge_json)), ']')),
                       '\\\\"', '"'),
                   ',')
           ) AS path_json
    FROM edges
) t
LATERAL VIEW json_tuple(path_json, 'value') exploded_table AS path;
============================================================
import sys
# Module-level adjacency map: parent id -> list of child ids.
tree = {}


def build_tree(parent_id, child_id):
    """Record ``child_id`` as a child of ``parent_id`` in the global tree."""
    tree.setdefault(parent_id, []).append(child_id)


def find_all_paths(node, path=None):
    """Yield every path from ``node`` down to a leaf of the global tree.

    Args:
        node: Node to start from.
        path: Optional list of ancestors to prepend to each yielded path.
            It is never modified.

    Yields:
        Lists of node ids, one per leaf below ``node``, in depth-first order
        with siblings in insertion order.  A node that has no entry in
        ``tree`` is treated as a leaf.

    Note:
        The tree is assumed to be acyclic; a cycle reachable from ``node``
        would make the traversal run without terminating.
    """
    # A None default avoids sharing one list object between calls.
    prefix = [] if path is None else list(path)
    # Explicit stack instead of recursion so deep trees cannot hit the
    # interpreter's recursion limit.
    stack = [(node, prefix + [node])]
    while stack:
        current, current_path = stack.pop()
        if current not in tree:
            yield current_path
            continue
        # Push in reverse so the first child is expanded first.
        for child in reversed(tree[current]):
            stack.append((child, current_path + [child]))


def main():
    """Read ``parent child`` pairs from stdin and print all root-to-leaf paths.

    Each input line holds two whitespace-separated ids.  Paths start at the
    node named ``root`` and are printed one per line, joined by ``->``.
    """
    for line in sys.stdin:
        fields = line.split()
        if not fields:
            # Blank lines carry no edge; skipping avoids an unpacking error.
            continue
        parent_id, child_id = fields
        build_tree(parent_id, child_id)

    root_node = 'root'
    for path in find_all_paths(root_node):
        print('->'.join(path))


if __name__ == "__main__":
    main()
-- Register the script as a session resource so TRANSFORM can invoke it.
ADD FILE /path/to/find_paths.py;

-- Stream (parent_id, child_id) rows through the script and store its output
-- in the `path` column of a new table.
CREATE TABLE all_paths AS
SELECT TRANSFORM (parent_id, child_id)
    USING 'python find_paths.py'
    AS path
FROM parent_child_relations;
|