mirror of
				https://github.com/microsoft/autogen.git
				synced 2025-11-04 03:39:52 +00:00 
			
		
		
		
	* add basic support to Spark dataframe add support to SynapseML LightGBM model update to pyspark>=3.2.0 to leverage pandas_on_Spark API * clean code, add TODOs * add sample_train_data for pyspark.pandas dataframe, fix bugs * improve some functions, fix bugs * fix dict change size during iteration * update model predict * update LightGBM model, update test * update SynapseML LightGBM params * update synapseML and tests * update TODOs * Added support to roc_auc for spark models * Added support to score of spark estimator * Added test for automl score of spark estimator * Added cv support to pyspark.pandas dataframe * Update test, fix bugs * Added tests * Updated docs, tests, added a notebook * Fix bugs in non-spark env * Fix bugs and improve tests * Fix uninstall pyspark * Fix tests error * Fix java.lang.OutOfMemoryError: Java heap space * Fix test_performance * Update test_sparkml to test_0sparkml to use the expected spark conf * Remove unnecessary widgets in notebook * Fix iloc java.lang.StackOverflowError * fix pre-commit * Added params check for spark dataframes * Refactor code for train_test_split to a function * Update train_test_split_pyspark * Refactor if-else, remove unnecessary code * Remove y from predict, remove mem control from n_iter compute * Update workflow * Improve _split_pyspark * Fix test failure of too short training time * Fix typos, improve docstrings * Fix index errors of pandas_on_spark, add spark loss metric * Fix typo of ndcgAtK * Update NDCG metrics and tests * Remove unuseful logger * Use cache and count to ensure consistent indexes * refactor for merge maain * fix errors of refactor * Updated SparkLightGBMEstimator and cache * Updated config2params * Remove unused import * Fix unknown parameters * Update default_estimator_list * Add unit tests for spark metrics
		
			
				
	
	
		
			98 lines
		
	
	
		
			2.1 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			98 lines
		
	
	
		
			2.1 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
# Parameter names common to every LightGBM parameter list defined in this
# module; the classifier/regressor/ranker lists are built by extending it.
# NOTE(review): the names look like SynapseML LightGBM estimator params --
# confirm against the SynapseML version in use.
#
# Every entry must be followed by a comma: Python silently concatenates
# adjacent string literals, which would merge two names into one bogus entry.
ParamList_LightGBM_Base = [
    "baggingFraction",
    "baggingFreq",
    "baggingSeed",
    "binSampleCount",
    "boostFromAverage",
    "boostingType",
    "catSmooth",
    "categoricalSlotIndexes",
    "categoricalSlotNames",
    "catl2",
    "chunkSize",
    "dataRandomSeed",
    "defaultListenPort",
    "deterministic",
    "driverListenPort",
    "dropRate",
    "dropSeed",
    "earlyStoppingRound",
    "executionMode",
    "extraSeed",
    "featureFraction",
    "featureFractionByNode",
    "featureFractionSeed",
    "featuresCol",
    "featuresShapCol",
    "fobj",
    "improvementTolerance",
    "initScoreCol",
    "isEnableSparse",
    "isProvideTrainingMetric",
    "labelCol",
    "lambdaL1",
    "lambdaL2",
    "leafPredictionCol",
    "learningRate",
    "matrixType",
    "maxBin",
    "maxBinByFeature",
    "maxCatThreshold",
    "maxCatToOnehot",
    "maxDeltaStep",
    "maxDepth",
    "maxDrop",
    "metric",
    "microBatchSize",
    "minDataInLeaf",
    "minDataPerBin",
    "minDataPerGroup",
    "minGainToSplit",
    "minSumHessianInLeaf",
    "modelString",
    "monotoneConstraints",
    "monotoneConstraintsMethod",
    "monotonePenalty",
    "negBaggingFraction",
    "numBatches",
    "numIterations",
    "numLeaves",
    "numTasks",
    "numThreads",
    "objectiveSeed",
    "otherRate",
    "parallelism",
    "passThroughArgs",
    "posBaggingFraction",
    "predictDisableShapeCheck",
    "predictionCol",
    "repartitionByGroupingColumn",
    "seed",
    "skipDrop",
    "slotNames",
    "timeout",
    "topK",
    "topRate",
    "uniformDrop",
    "useBarrierExecutionMode",
    "useMissing",
    "useSingleDatasetMode",
    "validationIndicatorCol",
    "verbosity",
    "weightCol",
    "xGBoostDartMode",
    "zeroAsMissing",
    "objective",
]
 | 
						|
# Classifier parameter list: a new list holding every base name followed by
# the four extra names below (the base list itself is not modified).
ParamList_LightGBM_Classifier = [
    *ParamList_LightGBM_Base,
    "isUnbalance",
    "probabilityCol",
    "rawPredictionCol",
    "thresholds",
]
 | 
						|
# Regressor parameter list: a new list holding every base name plus one
# additional name (the base list itself is not modified).
ParamList_LightGBM_Regressor = [*ParamList_LightGBM_Base, "tweedieVariancePower"]
 | 
						|
# Ranker parameter list: a new list holding every base name followed by the
# four extra names below (the base list itself is not modified).
ParamList_LightGBM_Ranker = [
    *ParamList_LightGBM_Base,
    "groupCol",
    "evalAt",
    "labelGain",
    "maxPosition",
]
 |